Example #1
def _fix_normal_cancer_names(filename, normal_sample, cancer_sample):
    # VarScan calls the samples NORMAL, TUMOR.  Fix them.
    from genomicode import hashlib

    lines, header_i, samples = _read_vcf(filename)

    normal_sample_h = hashlib.hash_var(normal_sample)
    cancer_sample_h = hashlib.hash_var(cancer_sample)

    header = lines[header_i]
    header1 = header[:-len(samples)]
    header2 = header[-len(samples):]

    assert sorted(header2) == ["NORMAL", "TUMOR"], header2
    for i in range(len(header2)):
        if header2[i] == "NORMAL":
            header2[i] = normal_sample_h
        if header2[i] == "TUMOR":
            header2[i] = cancer_sample_h
    lines[header_i] = header1 + header2

    handle = open(filename, 'w')
    for x in lines:
        print >> handle, "\t".join(x)
    handle.close()
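
All of these examples lean on genomicode.hashlib.hash_var to turn free-form sample names into safe identifiers. The genomicode implementation is not shown on this page; judging from the comment in Example #14 ("196B-MG -> X196B_MG"), a minimal stand-in would behave roughly like this (a sketch, not the real library):

import re

def hash_var(name):
    # Replace every character that is not a letter, digit, or
    # underscore with an underscore.
    x = re.sub(r"[^a-zA-Z0-9_]", "_", name)
    # Identifiers may not start with a digit, so prefix with "X",
    # e.g. "196B-MG" -> "X196B_MG".
    if x[:1].isdigit():
        x = "X" + x
    return x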
Example #2
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from Betsy import module_utils
        import run_MACS14

        bam_node, group_node = antecedents
        bam_path = module_utils.check_inpath(bam_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Get options.
        treat_sample = module_utils.get_user_option(user_options,
                                                    "treatment_sample",
                                                    not_empty=True)
        control_sample = module_utils.get_user_option(user_options,
                                                      "control_sample",
                                                      not_empty=True)

        # Set the experiment name.
        name1 = hashlib.hash_var(treat_sample)
        name2 = hashlib.hash_var(control_sample)
        experiment_name = "%s_vs_%s" % (name1, name2)

        # Make sure the samples exist.
        samples = [x[1] for x in sample_groups]
        assert treat_sample in samples, "Unknown sample: %s" % treat_sample
        assert control_sample in samples, "Unknown sample: %s" % control_sample

        # Find the BAM files.
        treat_filename = run_MACS14.find_bam_file(bam_path, treat_sample,
                                                  sample_groups)
        control_filename = run_MACS14.find_bam_file(bam_path, control_sample,
                                                    sample_groups)
        assert treat_filename, "Missing bam file for %s" % treat_sample
        assert control_filename, "Missing bam file for %s" % control_sample

        cmd = make_pyspp_command(treat_filename,
                                 control_filename,
                                 out_path,
                                 num_procs=num_cores)
        log_file = "%s.log" % experiment_name
        cmd = "%s >& %s" % (cmd, log_file)
        parallel.sshell(cmd, path=out_path)

        files = [
            "binding.positions.txt",
            #"broadPeak",
            "crosscorrelation.pdf",
            "density.wig",
            "enrichment.estimates.wig",
            "enrichment.wig",
            #"narrowPeak",   # might be empty if no peaks found
            log_file,
        ]
        filenames = [os.path.join(out_path, x) for x in files]
        filelib.assert_exists_nz_many(filenames)
Example #3
def cmp_sample(x, y,
               case_insensitive, hash_samples, ignore_nonalnum, ignore_blank):
    from genomicode import hashlib

    if ignore_blank and not x.strip():
        return False

    if case_insensitive:
        x, y = x.upper(), y.upper()
    if hash_samples:
        x, y = hashlib.hash_var(x), hashlib.hash_var(y)
    if ignore_nonalnum:
        x, y = strip_nonalnum(x), strip_nonalnum(y)
    return x == y
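
A quick illustration of how the flags compose (made-up sample names; assumes the hash_var behavior sketched under Example #1):

# Case is folded first, then hashing maps "-" and " " to "_", so
# these two names compare equal.
assert cmp_sample("196B-MG", "196b mg",
                  case_insensitive=True, hash_samples=True,
                  ignore_nonalnum=False, ignore_blank=False)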
Example #4
def find_sample(sample_list, sample, case_insensitive, hash_samples,
                ignore_nonalnum, ignore_blank):
    # Return the index of this sample or -1.
    from genomicode import hashlib

    if ignore_blank and not sample.strip():
        return -1

    sample_list_cmp = sample_list
    sample_cmp = sample
    if case_insensitive:
        sample_list_cmp = [x.upper() for x in sample_list_cmp]
        sample_cmp = sample_cmp.upper()
    if hash_samples:
        #sample_list_cmp = [hashlib.hash_var(x) for x in sample_list_cmp]
        sample_list_cmp = hashlib.hash_var_many(sample_list_cmp)
        sample_cmp = hashlib.hash_var(sample_cmp)
    if ignore_nonalnum:
        sample_list_cmp = [strip_nonalnum(x) for x in sample_list_cmp]
        sample_cmp = strip_nonalnum(sample_cmp)

    ## Too slow.
    ##for i, x in enumerate(sample_list_cmp):
    ##    if x == sample_cmp:
    ##        return i
    ##        #I.append(i)
    ##return -1
    try:
        return sample_list_cmp.index(sample_cmp)
    except ValueError:
        return -1
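
Both this function and cmp_sample in Example #3 assume a strip_nonalnum helper that is not shown here; a plausible one-liner (an assumption, not the original):

import re

def strip_nonalnum(s):
    # Drop every character that is not a letter or a digit.
    return re.sub(r"[^a-zA-Z0-9]", "", s)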
Example #5
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        #from genomicode import parallel
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        # TODO: Merge with merge_variants_snp.py.
        #CALLERS = [
        #    "gatk", "platypus", "varscan",
        #    ]
        vcf_paths = [x.identifier for x in antecedents]
        nodes = [x.data for x in antecedents]
        CALLERS = [x.attributes["caller"] for x in nodes]
        assert len(CALLERS) == len(vcf_paths)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # list of (sample, caller, out_vcf_path, in_vcf_file, out_vcf_file)
        jobs = []
        for i, caller in enumerate(CALLERS):
            inpath = vcf_paths[i]
            caller_h = hashlib.hash_var(caller)
            
            vcf_files = filelib.list_files_in_path(
                inpath, endswith=".vcf", toplevel_only=True)
            for file_ in vcf_files:
                # IN_FILE:   <inpath>/<sample>.vcf
                # OUT_FILE:  <out_path>/<caller>.vcf/<sample>.vcf
                p, sample, e = mlib.splitpath(file_)
                assert e == ".vcf"
                out_vcf_path = os.path.join(out_path, "%s.vcf" % caller_h)
                out_vcf_file = os.path.join(out_vcf_path, "%s.vcf" % sample)

                x = filelib.GenericObject(
                    sample=sample, caller=caller,
                    out_vcf_path=out_vcf_path, in_vcf_file=file_,
                    out_vcf_file=out_vcf_file)
                jobs.append(x)
                
        # Make sure the same samples are found in all callers.
        caller2samples = {}
        for j in jobs:
            if j.caller not in caller2samples:
                caller2samples[j.caller] = []
            caller2samples[j.caller].append(j.sample)
        comp_samples = None
        for caller, samples in caller2samples.iteritems():
            samples = sorted(samples)
            if comp_samples is None:
                comp_samples = samples
            assert comp_samples == samples, "%s %s" % (comp_samples, samples)

        for j in jobs:
            filelib.safe_mkdir(j.out_vcf_path)
            os.symlink(j.in_vcf_file, j.out_vcf_file)

        return metadata
Example #6
def write_multi_cls_file(outhandle, names, classes):
    # Only handles categorical CLS files with any number of classes.
    # names is a unique list of the names of the possible classes.
    # classes should be a list of [0 - len(class_names)-1] or class
    # names.
    from genomicode import hashlib
    from genomicode import jmath

    # Not handled: if class names are numbers.
    for x in names:
        assert not jmath.is_int(x), "Invalid class name: %s" % x

    # Make sure the class names are unique.
    x = {}.fromkeys(names)
    assert len(names) == len(x), "class names not unique"

    # Hash the class names.
    names_h = [hashlib.hash_var(x) for x in names]
    # Make sure hashed names are still unique.
    x = {}.fromkeys(names_h)
    assert len(names_h) == len(x)

    # Make sure the class assignments are valid.
    assert classes
    for x in classes:
        if jmath.is_int(x):
            x = int(x)
            assert x >= 0 and x < len(names)
        else:
            assert x in names

    # Convert class assignments to numbers, if necessary.
    classes_int = []
    for x in classes:
        if not jmath.is_int(x):
            x = names.index(x)
        classes_int.append(x)

    # Some GenePattern tools require the first sample to be in class
    # 0.  Make sure this is true.
    assert classes_int[0] == 0, "First sample must be in first class."

    # Write the file.
    # Space or tab-delimited format.
    # <num samples> <num classes> 1
    # # <class name 0> <class name 1> ...
    # <0/1 or class name> ...
    if type(outhandle) is type(""):
        outhandle = open(outhandle, 'w')
    num_classes = len(names)
    num_samples = len(classes)
    x = [num_samples, num_classes, 1] + [""] * (num_samples - 3)
    print >> outhandle, "\t".join(map(str, x))

    x = ["#"] + names_h + [""] * (num_samples - 3)
    print >> outhandle, "\t".join(x)

    print >> outhandle, "\t".join(map(str, classes_int))
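
For instance, three samples split into two classes come out as three tab-delimited rows, each padded to one column per sample (a sketch; assumes genomicode is importable and that jmath.is_int is a plain integer test):

from StringIO import StringIO

handle = StringIO()
write_multi_cls_file(handle, ["wt", "ko"], [0, 0, 1])
print handle.getvalue()
# Prints (tab-delimited):
# 3  2  1
# #  wt  ko
# 0  0  1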
Example #7
def _process_sample(sample, case_insensitive, hash_samples, ignore_nonalnum):
    from genomicode import hashlib

    x = sample
    if case_insensitive:
        x = x.upper()
    if hash_samples:
        x = hashlib.hash_var(x)
    if ignore_nonalnum:
        x = strip_nonalnum(x)
    return x
Example #8
def check_matrix(X):
    import re
    import arrayio
    import copy
    from genomicode import hashlib
    from genomicode import AnnotationMatrix

    assert arrayio.gct_format.is_matrix(X)

    # Make sure gene IDs (NAME) is unique and non-empty.
    assert X.row_names()[0].upper() == "NAME", \
           "Header of first column should be: NAME"
    seen = {}
    for i, name in enumerate(X.row_names("NAME")):
        assert name.strip(), "Empty gene ID in row %d." % (i + 1)
        assert name not in seen, "Duplicate gene ID: %s" % name
        seen[name] = 1

    # Make sure sample names don't contain spaces or other
    # punctuation.  GSEA seems to be sensitive to these things.
    sample_names = X.col_names(arrayio.tdf.SAMPLE_NAME)
    bad_names = []
    for i, name in enumerate(sample_names):
        if not name:
            bad_names.append("<blank>")
        elif re.search("[^a-zA-Z0-9_-]", name):
            bad_names.append(name)
    #assert not bad_names, "Bad sample name: %s" % ", ".join(bad_names)

    # If there are bad names, try to fix them.
    if bad_names:
        X = copy.deepcopy(X)
        sample_names = [hashlib.hash_var(x) for x in sample_names]
        sample_names = AnnotationMatrix.uniquify_headers(sample_names)
        header = X._resolve_synonym(arrayio.tdf.SAMPLE_NAME, X.col_names,
                                    X._synonyms)
        X._col_names[header] = sample_names

    # Make sure sample names are unique.
    seen = {}
    for i, name in enumerate(sample_names):
        assert name not in seen, "Duplicate sample name: %s" % name
        seen[name] = 1

    return X
Example #9
def _uniquify_samples_in_vcf(filename, to_append):
    from genomicode import hashlib

    lines, header_i, samples = _read_vcf(filename)

    header = lines[header_i]
    header1 = header[:-len(samples)]
    header2 = header[-len(samples):]

    x = header2
    x = ["%s %s" % (x, to_append) for x in x]
    x = [hashlib.hash_var(x) for x in x]
    header2 = x
    lines[header_i] = header1 + header2

    handle = open(filename, 'w')
    for x in lines:
        print >> handle, "\t".join(x)
    handle.close()
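
Examples #1 and #9 both depend on a _read_vcf helper that is not shown on this page. It evidently returns the tab-split lines, the index of the "#CHROM" header row, and the sample names; a minimal sketch under those assumptions:

def _read_vcf(filename):
    # Returns (lines, header_i, samples), where lines is a list of
    # tab-split rows, header_i is the index of the "#CHROM" header
    # row, and samples are the column names after FORMAT.
    lines = [x.rstrip("\r\n").split("\t") for x in open(filename)]
    header_i = None
    for i, cols in enumerate(lines):
        if cols and cols[0] == "#CHROM":
            header_i = i
            break
    assert header_i is not None, "Missing #CHROM header: %s" % filename
    # Standard VCF has 9 fixed columns before the sample columns.
    samples = lines[header_i][9:]
    return lines, header_i, samples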
Example #10
def _make_filename(M, gene_i, filestem, analysis, gene_headers, filetype,
                   fileext):
    # gene_i is the index of the gene in the matrix.
    # If filestem is None, will not use a filestem.
    # gene_headers is a list of headers from Matrix.  If empty, will
    # try to provide one.

    # Format:
    # <filestem>.<analysis>.<gene_name>.<filetype>.<fileext>
    #
    # <filestem>    BRCA  (has no "." at end)
    # <analysis>    SUBTYPE,ER,OS
    # <gene_name>   GAPDH
    # <filetype>    boxplot, prism, waterfall
    # <fileext>     txt, png
    from genomicode import hashlib

    assert type(analysis) is type("") and analysis
    assert type(filetype) is type("") and filetype
    assert type(fileext) is type("") and fileext
    for h in gene_headers:
        assert h in M.row_names()

    # Figure out the gene_name.
    x = format_gene_name(M, gene_headers, gene_i)
    gene_name = hashlib.hash_var(x)
    #if gene_headers:
    #    x = [M.row_names(x)[gene_i] for x in gene_headers]
    #    gene_name = "_".join(x)
    #else:
    #    x = get_gene_name(M, gene_i)
    #    x = hashlib.hash_var(x)
    #    gene_name = x

    parts = [analysis, gene_name, filetype, fileext]
    if filestem:
        parts.insert(0, filestem)
    filename = ".".join(parts)
    return filename
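
For example, with filestem "BRCA", analysis "SUBTYPE", a gene whose formatted name is GAPDH, and a boxplot saved as PNG, the parts join as follows (values taken from the format comment above):

# _make_filename(M, gene_i, "BRCA", "SUBTYPE", [], "boxplot", "png")
# returns "BRCA.SUBTYPE.GAPDH.boxplot.png"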
Example #11
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from Betsy import module_utils
        import run_MACS14

        bam_node, group_node = antecedents
        bam_path = module_utils.check_inpath(bam_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Get options.
        treat_sample = module_utils.get_user_option(user_options,
                                                    "treatment_sample",
                                                    not_empty=True)
        control_sample = module_utils.get_user_option(user_options,
                                                      "control_sample")
        fragment_length = module_utils.get_user_option(
            user_options, "peakseq_fragment_length", not_empty=True, type=int)
        mappability_file = module_utils.get_user_option(user_options,
                                                        "mappability_file",
                                                        not_empty=True,
                                                        check_file=True)
        assert fragment_length > 0 and fragment_length < 1000

        # Set the experiment name.
        name1 = hashlib.hash_var(treat_sample)
        name2 = hashlib.hash_var(control_sample)
        experiment_name = "%s_vs_%s" % (name1, name2)

        # Make sure the samples exist.
        samples = [x[1] for x in sample_groups]
        assert treat_sample in samples, "Unknown sample: %s" % treat_sample
        if control_sample:
            assert control_sample in samples, \
                   "Unknown sample: %s" % control_sample

        # Find the BAM files.
        treat_filename = run_MACS14.find_bam_file(bam_path, treat_sample,
                                                  sample_groups)
        control_filename = run_MACS14.find_bam_file(bam_path, control_sample,
                                                    sample_groups)
        assert treat_filename, "Missing bam file for %s" % treat_sample
        assert control_filename, "Missing bam file for %s" % control_sample

        cmd = make_peakseq_command(treat_filename, control_filename, out_path,
                                   experiment_name, fragment_length,
                                   mappability_file)
        log_file = "%s.log" % experiment_name
        cmd = "%s >& %s" % (cmd, log_file)
        parallel.sshell(cmd, path=out_path)

        files = [
            "config.dat",
            log_file,
            "%s.txt" % experiment_name,
            # Can be length 0, if no peaks found.
            #"%s_narrowPeak.txt" % experiment_name,
        ]
        filenames = [os.path.join(out_path, x) for x in files]
        filelib.assert_exists_nz_many(filenames)
Example #12
def main():
    import os
    import argparse

    from genomicode import jmath
    #from genomicode import AnnotationMatrix
    #from genomicode import colorlib
    #from genomicode import pcalib
    from genomicode import hashlib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile",
                        help="Tab-delimited text file in Prism format.  "
                        "Each column is a series.  First row is header.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png.  "
        "Will generate PNG format by default.  If this file name ends with "
        ".pdf, will generate a PDF file instead.")

    group = parser.add_argument_group(title="General Appearance")
    group.add_argument("--no_box",
                       action="store_true",
                       help="Turn off the box around the plot.")
    group.add_argument("--height",
                       type=int,
                       help="Height (in pixels) of the plot.")
    group.add_argument("--width",
                       type=int,
                       help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left",
        default=1.0,
        type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument("--mar_bottom",
                       default=1.0,
                       type=float,
                       help="Scale margin at bottom of plot.  Default 1.0.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument("--ylab", help="Label the Y-axis.")

    group = parser.add_argument_group(title="Legend")
    group.add_argument("--add_legend",
                       action="store_true",
                       help="Add a legend to the plot.")
    group.add_argument("--legend_inset", type=float, default=0.05, help="")
    LEGEND_LOCATIONS = [
        "bottomright",
        "bottom",
        "bottomleft",
        "left",
        "topleft",
        "top",
        "topright",
        "right",
        "center",
    ]
    group.add_argument("--legend_loc",
                       choices=LEGEND_LOCATIONS,
                       help="Where to draw the legend.")

    group = parser.add_argument_group(title="Point Appearance")
    group.add_argument("--scale_points",
                       default=1.0,
                       type=float,
                       help="Scale the size of the points.  Default 1.0")
    group.add_argument("--default_color",
                       help="Default color of points.  Format: #000000.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10

    assert args.legend_inset >= 0 and args.legend_inset < 10
    if args.legend_loc is None:
        args.legend_loc = "bottomright"
    assert args.scale_points > 0 and args.scale_points < 20

    if args.default_color:
        assert len(args.default_color) == 7
        assert args.default_color[0] == "#"

    # Read the data file.
    # List of (name, values).
    MATRIX = read_prism_file(args.datafile)

    height = args.height or 2400
    width = args.width or 3200

    # Pull out the values and colors for the plot.
    default_color = "#000000"
    if args.default_color:
        default_color = args.default_color

    # Start R and set up the environment.
    R = jmath.start_R()
    R("library(beeswarm)")

    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    xlab = ""
    if args.xlab:
        xlab = args.xlab
    ylab = ""
    if args.ylab:
        ylab = args.ylab

    lwd_box = 2
    lwd_axis = 2
    #lwd_regr = 3
    cex = 1.0 * args.scale_points
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.0

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn("bitmap",
               args.plot_file,
               type=bm_type,
               height=height,
               width=width,
               units="px",
               res=300)

    # Set the margins.
    x = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [x + 0.1 for x in x]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    R("X <- list()")
    for title, values in MATRIX:
        title_h = hashlib.hash_var(title)
        jmath.R_equals(values, "x")
        R('X[["%s"]] <- x' % title_h)

    keywds = {
        "cex.axis": cex_lab,  # Y-axis
        "cex.names": cex_lab,  # X-axis
    }
    jmath.R_fn(
        "beeswarm",
        jmath.R_var("X"),
        main="",
        xlab="",
        ylab="",
        pch=19,
        cex=cex,
        #axes=jmath.R_var("FALSE"),
        RETVAL="x",
        **keywds)
    # Make plot area solid white.
    #jmath.R('usr <- par("usr")')
    #jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    #jmath.R_fn(
    #    "hist", jmath.R_var("X"), plot=jmath.R_var("FALSE"),
    #    main=main, xlab="", ylab="", axes=jmath.R_var("FALSE"),
    #    add=jmath.R_var("TRUE"))

    # Calculate correlation, and other statistics.
    # TODO: Should calculate this for each series.
    #r = jmath.R("cor(X, Y)")
    #p_value = jmath.R("cor.test(X, Y)$p.value")
    #r = r[0]
    #p_value = p_value[0]
    #print "R = %.2f" % r
    #print "p = %.2g" % p_value

    if not args.no_box:
        jmath.R_fn("box", lwd=lwd_box)
    jmath.R_fn("title",
               main=main,
               sub=sub,
               xlab=xlab,
               ylab=ylab,
               **{
                   "cex.lab": cex_lab,
                   "cex.main": cex_main,
                   "cex.sub": cex_sub
               })
    R("par(op)")
    jmath.R_fn("dev.off")
Example #13
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import hashlib
        from Betsy import module_utils

        bam_node, ref_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # java -jar /usr/local/bin/RNA-SeQC_v1.1.8.jar \
        #   -o <sample> -r <reference_file> -s "<sample>|<in_filename>|NA"
        #   -t <gtf_file> >& <log_filename>"
        # <out_path>        Output directory.  Will be created if not exists.
        # <in_filename>     BAM file
        # <reference_file>  /data/biocore/genomes/UCSC/mm10.fa
        # <gtf_file>   /data/biocore/rsem/mouse_refseq_mm10/UCSC_knownGenes.gtf
        #
        # <reference_file> must be indexed and have a dict file.

        rna_seqc_jar = filelib.which_assert(config.rna_seqc_jar)

        GTF = module_utils.get_user_option(
            user_options, "rna_seqc_gtf_file", not_empty=True)
        assert os.path.exists(GTF), "File not found: %s" % GTF

        # list of infile, out_path, ref_file, gtf_file, sample, log_file
        jobs = []
        for in_filename in bam_filenames:
            p, file_ = os.path.split(in_filename)
            f, e = os.path.splitext(file_)
            sample = hashlib.hash_var(f)
            out_path_rna_seqc = os.path.join(out_path, sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)

            x = in_filename, out_path_rna_seqc, ref.fasta_file_full, GTF, \
                sample, log_filename
            jobs.append(x)

        sq = parallel.quote
        commands = []
        for x in jobs:
            (in_filename, out_path_rna_seqc, ref_filename, gtf_filename, \
             sample, log_filename) = x

            x = [sample, in_filename, "NA"]
            x = "|".join(x)
            x = [
                'java',
                '-jar', rna_seqc_jar,
                '-o', sq(out_path_rna_seqc),
                '-r', sq(ref_filename),
                '-s', "'%s'" % x,
                '-t', gtf_filename,
                ]
            x = " ".join(x)
            cmd = "%s >& %s" % (x, log_filename)
            commands.append(cmd)

        # Gets lots of errors.

        x = parallel.pshell(commands, max_procs=num_cores)
        run_log = os.path.join(out_path, "run.log")
        open(run_log, 'w').write(x)

        # Check for outfile.
        # Make sure the analysis completed successfully.
        for x in jobs:
            (in_filename, out_path_rna_seqc, ref_filename, gtf_filename, \
             sample, log_filename) = x
            filelib.assert_exists_nz(out_path_rna_seqc)
Example #14
def make_cancer_samples_file(vcf_file, nc_match, outfile):
    # Two column tab-delimited text.  No headers.
    # <germline>  <tumor>
    from genomicode import vcflib
    from genomicode import hashlib
    from genomicode import jmath

    # vcf samples (joined with bcftools).
    # PIM005_G   peak1   2:PIM001_G      peak2   3:PIM001_G   [...]

    germline_samples = [x[0] for x in nc_match]
    tumor_samples = [x[1] for x in nc_match]

    # Hopefully should be able to find the samples in the first 1000
    # rows.
    vcf = vcflib.read(vcf_file, nrows=1000)

    # Get the samples from the VCF file.
    samples = vcf.samples

    # HACK: Fix some problems with old files.
    #samples = [x.replace("Cap475-5983-19", "PIM001_G") for x in samples]

    # HACK: Radia has calls from RNA.  Ignore them.
    # <tumor_sample>_RNA
    rna = {}.fromkeys(["%s_RNA" % x for x in tumor_samples])
    samples = [x for x in samples if x not in rna]

    # Samples may be hashed, e.g.
    # 196B-MG -> X196B_MG
    # Need to compare against hashed samples.
    germline_samples_h = [hashlib.hash_var(x) for x in germline_samples]
    tumor_samples_h = [hashlib.hash_var(x) for x in tumor_samples]
    # Make sure hashing does not make duplicate tumor samples.
    # Germline may be duplicated.
    #assert not _dups(germline_samples)
    assert not _dups(tumor_samples)
    #assert not _dups(germline_samples_h)
    assert not _dups(tumor_samples_h)

    # Clean up samples.
    clean = []  # list of tuples ("G" or "T", sample_name)
    for sample in samples:
        if sample in germline_samples:
            x = "G", sample
        elif sample in germline_samples_h:
            # Don't unhash it.  Otherwise, snpeff will be confused.
            #i = germline_samples_h[sample]
            #x = "G", germline_samples[i]
            x = "G", sample
        elif sample in tumor_samples:
            x = "T", sample
        elif sample in tumor_samples_h:
            #i = tumor_samples_h[sample]
            #x = "T", tumor_samples[i]
            x = "T", sample
        else:
            # <num>:<germline sample name>
            x = sample.split(":", 1)
            assert len(x) == 2, "Unknown sample name (%s) in: %s" % (sample,
                                                                     vcf_file)
            assert jmath.is_int(
                x[0]), "Unknown sample name (%s) in: %s" % (sample, vcf_file)
            s = x[1]
            if s in germline_samples:
                x = "G", s
            elif s in germline_samples_h:
                #i = germline_samples_h[s]
                #x = "G", germline_samples[i]
                x = "G", s
            else:
                raise AssertionError, "Unknown sample name: %s" % sample
        clean.append(x)
    samples = clean

    # If there are no germline samples, then don't make a file.
    x1 = [x for x in samples if x[0] == "G"]
    x2 = [x for x in samples if x[0] == "T"]
    if not x1:
        return None
    # Make sure there are the same number of germline samples.
    assert len(x1) == len(x2), "Germline/Tumor mismatch: %s" % vcf_file
    assert len(samples) % 2 == 0

    # Pairs should contain one "G" and one "T".
    for i in range(0, len(samples), 2):
        t1, s1 = samples[i]
        t2, s2 = samples[i + 1]
        assert t1 != t2, "Bad Germline/Tumor ordering: %s" % vcf_file

    lines = []
    for i in range(0, len(samples), 2):
        t1, s1 = samples[i]
        t2, s2 = samples[i + 1]
        # Want germline, then tumor.
        if t1 == "T" and t2 == "G":
            t1, s1, t2, s2 = t2, s2, t1, s1
        assert t1 == "G" and t2 == "T"
        x = "%s\t%s\n" % (s1, s2)
        lines.append(x)
    open(outfile, 'w').writelines(lines)
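
Typical use pairs each germline sample with its tumor sample and writes the two-column file (file and sample names are made up, and assume merged.vcf contains exactly those samples):

nc_match = [("PIM001_G", "PIM001_T"), ("PIM002_G", "PIM002_T")]
make_cancer_samples_file("merged.vcf", nc_match, "cancer_samples.txt")
# cancer_samples.txt now contains, tab-delimited:
# PIM001_G  PIM001_T
# PIM002_G  PIM002_T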
Example #15
def calc_gsea(expression_file, class_label_file, user_options, num_cores,
              out_path, permutation_type, database):
    import os
    import arrayio
    from genomicode import parallel
    from genomicode import arraysetlib
    from genomicode import hashlib
    from genomicode import filelib
    from genomicode import genesetlib
    from Betsy import module_utils as mlib

    names, classes = arraysetlib.read_cls_file(class_label_file)
    assert names
    assert len(names) >= 2, ("At least 2 classes needed for GSEA analysis.  "
                             "Found only: %s" % (names[0]))
    # Make sure there are the same number of samples in the class
    # label file as in the gene expression file.
    MATRIX = arrayio.read(expression_file)
    assert MATRIX.ncol() == len(classes), (
        "Mismatch: expression (%d) classes (%d)" %
        (MATRIX.ncol(), len(classes)))
    # Make sure classes go from [0, len(names))
    for i in classes:
        assert i >= 0 and i < len(names)

    fdr_cutoff = mlib.get_user_option(user_options,
                                      "gsea_fdr_cutoff",
                                      not_empty=True,
                                      type=float)
    assert fdr_cutoff > 0 and fdr_cutoff <= 1

    # Find all combinations of names and classes.
    opj = os.path.join
    jobs = []
    for i1 in range(len(names) - 1):
        for i2 in range(i1 + 1, len(names)):
            N1 = names[i1]
            N2 = names[i2]
            # Indexes should be 1-based.
            I1 = [i + 1 for i in range(len(classes)) if classes[i] == i1]
            I2 = [i + 1 for i in range(len(classes)) if classes[i] == i2]
            N1_h = hashlib.hash_var(N1)
            N2_h = hashlib.hash_var(N2)
            stem = "%s.vs.%s" % (N1_h, N2_h)

            gsea_path = opj(out_path, "%s.%s.gsea" % (stem, database))

            x = filelib.GenericObject(N1=N1,
                                      N2=N2,
                                      I1=I1,
                                      I2=I2,
                                      stem=stem,
                                      gsea_path=gsea_path)
            jobs.append(x)

    permutation_types = {}
    commands = []
    for j in jobs:
        # Need at least 3 samples for "phenotype" permutations.  If
        # there are fewer samples, then set to "gene_set".
        ptype = permutation_type
        if len(j.I1) < 3 or len(j.I2) < 3:
            ptype = "gene_set"
        permutation_types[ptype] = 1
        cmd = make_gsea_command(expression_file, class_label_file, j.gsea_path,
                                j.N1, j.N2, j.I1, j.I2, ptype, database)
        commands.append(cmd)
    for cmd in commands:
        parallel.sshell(cmd)

    # Summarize results.
    # Make a geneset file.
    significant = []
    for j in jobs:
        x = find_significant_gene_sets(j.gsea_path, j.N1, j.N2, fdr_cutoff)
        significant.append(x)

    genesets = []
    for j, x in zip(jobs, significant):
        genes1, genes2 = x
        gs_name1 = "%s_%s" % (j.stem, j.N1)
        gs_name2 = "%s_%s" % (j.stem, j.N2)
        gs1 = genesetlib.GeneSet(gs_name1, "", genes1)
        gs2 = genesetlib.GeneSet(gs_name2, "", genes2)
        genesets.extend([gs1, gs2])
    x = "genesets.fdr_%g.gmt" % fdr_cutoff
    geneset_file = opj(out_path, x)
    genesetlib.write_gmt(geneset_file, genesets)

    # Count the number of significant gene sets.
    x = "num_genesets.fdr_%g.txt" % fdr_cutoff
    summary_file = opj(out_path, x)
    handle = open(summary_file, 'w')
    header = "Group 1", "Group 2", "Gene Sets in Group 1", \
             "Gene Sets in Group 2"
    print >> handle, "\t".join(header)
    for j, x in zip(jobs, significant):
        genes1, genes2 = x
        x = j.N1, j.N2, len(genes1), len(genes2)
        assert len(x) == len(header)
        print >> handle, "\t".join(map(str, x))
    handle.close()

    return commands, sorted(permutation_types)
Example #16
def merge_vcf_files(vcf_filenames, out_filename, num_cores, tmp_path):
    # Put indexed files in tmp_path.
    import os
    import stat
    import shutil
    from genomicode import filelib
    from genomicode import hashlib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    # TODO: find the version number of these tools.
    bgzip = mlib.findbin("bgzip")
    tabix = mlib.findbin("tabix")
    bcftools = mlib.findbin("bcftools")
    sq = parallel.quote

    tmp_path = os.path.realpath(tmp_path)
    filelib.safe_mkdir(tmp_path)

    # Keep track of all commands run.
    metadata = {}
    metadata["commands"] = []

    # Ignore VCF files that don't have any variants.
    vcf_filenames = [x for x in vcf_filenames if os.stat(x)[stat.ST_SIZE] > 0]

    # If there are no VCF files with any variants, then just create an
    # empty outfile and return.
    if not vcf_filenames:
        open(out_filename, 'w').close()
        return metadata

    # 1.  Copy VCF files to temporary directory.             tmp_filename
    # 2.  Fix VCF files (e.g. NextGENe, JointSNVMix broken)
    # 3.  Sort the VCF files (needed for tabix)
    # 4.  Compress  (bgzip)
    # 5.  Index     (tabix)
    # 6.  Merge

    jobs = []
    for in_filename in vcf_filenames:
        path, root, ext = mlib.splitpath(in_filename)
        sample = root
        x = "%s%s" % (hashlib.hash_var(root), ext)
        tmp_filename = os.path.join(tmp_path, x)
        x = filelib.GenericObject(
            sample=sample,
            in_filename=in_filename,
            tmp_filename=tmp_filename,
        )
        jobs.append(x)

    # Make sure temporary files are unique.
    seen = {}
    for j in jobs:
        assert j.tmp_filename not in seen
        seen[j.tmp_filename] = 1

    # Merge them in order of sample.  The germline sample will be
    # duplicated, and we will know the order of the germline sample.
    schwartz = [(x.sample, x) for x in jobs]
    schwartz.sort()
    jobs = [x[-1] for x in schwartz]

    # Copy all the VCF files to a temporary directory.
    for j in jobs:
        shutil.copy2(j.in_filename, j.tmp_filename)

    #for j in jobs:
    #    make_file_smaller(j.tmp_filename, 1000)

    for j in jobs:
        # NextGENe creates broken VCF files.  Fix them.
        fix_nextgene_vcf(j.tmp_filename)
        # JointSNVMix creates broken VCF files.  Fix them.
        fix_jointsnvmix_vcf(j.tmp_filename)

    for j in jobs:
        sort_vcf_file(j.tmp_filename)

    ## # Since we are merging the files, we need to make sure that
    ## # each file has a unique name.  If the names aren't unique,
    ## # then make them unique by adding the name of the file.
    ## all_unique = True
    ## seen = {}
    ## for x in jobs:
    ##     sample, in_filename, tmp_filename = x
    ##     samples = _get_samples_from_vcf(tmp_filename)
    ##     for s in samples:
    ##         if s in seen:
    ##             all_unique = False
    ##             break
    ##         seen[s] = 1
    ##     if not all_unique:
    ##         break
    ## if not all_unique:
    ##     for x in jobs:
    ##         sample, in_filename, tmp_filename = x
    ##         _uniquify_samples_in_vcf(tmp_filename, sample)

    # Compress the VCF files.
    # bgzip file.vcf
    commands = []
    for j in jobs:
        x = "%s %s" % (sq(bgzip), sq(j.tmp_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores, path=tmp_path)
    metadata["commands"].extend(commands)
    metadata["num_cores"] = num_cores
    x = ["%s.gz" % x.tmp_filename for x in jobs]
    filelib.assert_exists_nz_many(x)

    # Index the VCF files.
    # tabix -p vcf file.vcf.gz
    commands = []
    for j in jobs:
        x = "%s -p vcf %s.gz" % (sq(tabix), sq(j.tmp_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores, path=tmp_path)
    metadata["commands"].extend(commands)
    x = ["%s.gz.tbi" % j.tmp_filename for j in jobs]
    filelib.assert_exists_nz_many(x)

    # Run bcftools
    ## For VCF files from somatic calls, the germline sample will
    ## be duplicated.  Add --force-samples to make sure this is
    ## still merged.

    # Since we need to append all the VCF files, it's easy to run
    # into error:
    # OSError: [Errno 7] Argument list too long
    #
    # To reduce the chance of this, figure out the path of the
    # tmp_filename, and run the analysis in that path so we can
    # use relative filenames.
    tmp_path = None
    for j in jobs:
        path, file_ = os.path.split(j.tmp_filename)
        if tmp_path is None:
            tmp_path = path
        assert path == tmp_path

    cmd = [
        sq(bcftools),
        "merge",
        "-o %s" % sq(out_filename),
        "-O v",
        "--force-samples",
    ]
    for j in jobs:
        path, file_ = os.path.split(j.tmp_filename)
        assert path == tmp_path
        cmd.append("%s.gz" % file_)
    x = " ".join(cmd)
    parallel.sshell(x, path=tmp_path)
    metadata["commands"].append(x)

    return metadata
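
For reference, the commands this function ends up shelling out to look like the following (run from tmp_path; sample names illustrative):

# bgzip sample1.vcf
# tabix -p vcf sample1.vcf.gz
# bcftools merge -o /path/to/merged.vcf -O v --force-samples \
#     sample1.vcf.gz sample2.vcf.gz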
Example #17
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from genomicode import config
        from Betsy import module_utils

        tag_node, group_node = antecedents
        tag_path = module_utils.check_inpath(tag_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Get options.
        treat_sample = module_utils.get_user_option(user_options,
                                                    "treatment_sample",
                                                    not_empty=True)
        control_sample = module_utils.get_user_option(user_options,
                                                      "control_sample")

        # Set the experiment name.
        experiment_name = treat_sample
        if control_sample:
            name1 = hashlib.hash_var(treat_sample)
            name2 = hashlib.hash_var(control_sample)
            experiment_name = "%s_vs_%s" % (name1, name2)

        # Make sure the samples exist.
        samples = [x[1] for x in sample_groups]
        assert treat_sample in samples, "Unknown sample: %s" % treat_sample
        if control_sample:
            assert control_sample in samples, \
                   "Unknown sample: %s" % control_sample

        # Find the tag directories.
        treat_path = os.path.join(tag_path, treat_sample)
        assert os.path.exists(treat_path)
        if control_sample:
            control_path = os.path.join(tag_path, control_sample)
            assert os.path.exists(control_path)

        # Get the command.
        homer_path = filelib.which_assert(config.homer_path)
        x = os.path.join(homer_path, "bin", "findPeaks")
        assert filelib.exists_nz(x)
        find_peaks = x

        log_file = "%s.log" % experiment_name
        peak_file = "%s.peaks.txt" % experiment_name

        sq = parallel.quote
        cmd = [
            sq(find_peaks),
            sq(treat_path),
            "-style",
            "factor",
        ]
        if control_sample:
            cmd += ["-i", control_path]
        cmd = " ".join(cmd)
        cmd = "%s 2> %s 1> %s" % (cmd, log_file, peak_file)
        parallel.sshell(cmd, path=out_path)

        x = os.path.join(out_path, peak_file)
        filelib.assert_exists_nz(x)
Example #18
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from Betsy import module_utils
        import run_MACS14

        bam_node, group_node = antecedents
        bam_path = module_utils.check_inpath(bam_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Get options.
        treat_sample = module_utils.get_user_option(user_options,
                                                    "treatment_sample",
                                                    not_empty=True)
        control_sample = module_utils.get_user_option(user_options,
                                                      "control_sample")
        genome_size = module_utils.get_user_option(user_options,
                                                   "macs_genome",
                                                   not_empty=True)
        x = module_utils.get_user_option(user_options,
                                         "broad_peaks",
                                         allowed_values=["no", "yes"])
        broad_peaks = (x == "yes")
        x = module_utils.get_user_option(user_options,
                                         "macs_paired",
                                         allowed_values=["no", "yes"])
        is_paired = (x == "yes")

        # Set the name.
        name = hashlib.hash_var(treat_sample)
        if control_sample:
            x = hashlib.hash_var(control_sample)
            name = "%s_vs_%s" % (treat_sample, x)

        # Make sure the samples exist.
        samples = [x[1] for x in sample_groups]
        assert treat_sample in samples, "Unknown sample: %s" % treat_sample
        if control_sample:
            assert control_sample in samples, \
                   "Unknown sample: %s" % control_sample

        # Find the BAM files.
        treat_filename = run_MACS14.find_bam_file(bam_path, treat_sample,
                                                  sample_groups)
        assert treat_filename, "Missing bam file for %s" % treat_sample
        control_filename = None
        if control_sample:
            control_filename = run_MACS14.find_bam_file(
                bam_path, control_sample, sample_groups)
            assert control_filename, "Missing bam file for %s" % control_sample

        cmd = make_macs2_command(treat_filename,
                                 control_filename=control_filename,
                                 genome_size=genome_size,
                                 save_bedgraph_file=True,
                                 name=name,
                                 normalize_read_counts=True,
                                 paired=is_paired,
                                 broad_peak_calling=broad_peaks)
        parallel.sshell(cmd, path=out_path)

        files = [
            "%s_peaks.xls" % name,
        ]
        filenames = [os.path.join(out_path, x) for x in files]
        filelib.assert_exists_nz_many(filenames)
Example #19
def main():
    import os
    import sys
    from optparse import OptionParser, OptionGroup
    
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "-r", "--rma", dest="rma_dataset", type="string", default=None,
        help="Specify the RMA-normalized data to analyze.")
    parser.add_option(
        "-m", "--mas5", dest="mas5_dataset", type="string", default=None,
        help="Specify the MAS5-normalized data to analyze.")
    parser.add_option(
        "-i", "--illu", dest="illu_dataset", type="string", default=None,
        help="Specify the Illumina data to analyze.")
    parser.add_option(
        "", "--sigdb_path", dest="sigdb_path", type="string", default=None,
        help="Location of the sigdb/ directory.")
    parser.add_option(
        "", "--sigtag", dest="signature_tags", default=[], action="append",
        help="Specify a specific tag to use.")
    parser.add_option(
        "", "--sigid", dest="signature_ids", default=[], action="append",
        help="Specify a specific signature to use.")
    parser.add_option(
        "", "--max_signatures", dest="max_signatures", type="int",
        default=None,
        help="Maximum number of signatures to run (for DEBUGGING).")
    parser.add_option(
        "-j", "", dest="num_procs", type="int", default=1,
        help="Number of jobs to run in parallel.")
    parser.add_option(
        "-z", "", dest="archive", action="store_true", default=False,
        help="Archive the individual signatures.  Helpful for GenePattern.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "", "--gp_imod_all_vars", dest="gp_imod_all_vars", type="string",
        default=None,
        help="Special internal variable for use with GenePattern "
        "interactive modules.")
    parser.add_option(
        "", "--debug_gp_imod_all_vars", action="store_true", default=False, 
        dest="debug_gp_imod_all_vars",
        )
    
    #group = OptionGroup(parser, "Normalization")
    #group.add_option(
    #    "", "--normalization", dest="normalization", default="MAS5",
    #    help="How was the data set normalized (default MAS5).")
    #group.add_option(
    #    "-l", "--log_data", dest="log_data", action="store_true",
    #    default=False,
    #    help="Log the MAS5 data before analyzing.")
    #parser.add_option_group(group)

    group = OptionGroup(parser, "Pybinreg")
    group.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python.")
    group.add_option(
        "", "--matlab", dest="matlab", default=None,
        help="Specify the command to run matlab.")
    group.add_option(
        "", "--povray", dest="povray", default=None,
        help="Specify the command to run povray.")
    group.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    group.add_option(
        "", "--binreg", dest="binreg_path", default=None,
        help="Specify the path to the BinReg2.0 code.")
    group.add_option(
        "", "--pybinreg", dest="pybinreg", default=None,
        help="Specify the command to run pybinreg.py.")
    group.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option_group(group)

    options, args = parser.parse_args()
    #if len(args) < 1:
    #    #print sys.argv
    #    #print len(args), args
    #    parser.error("Please specify sigdb_path.")
    #elif len(args) > 1:
    #    parser.error("Too many arguments.")
    if args:
        parser.error("Too many arguments.")

    # DEBUG the gp_imod_all_vars variable.
    if options.debug_gp_imod_all_vars:
        assert not options.gp_imod_all_vars
        options.gp_imod_all_vars = (
            "mas5_expression_file_cb=file&mas5_expression_file_url=&"
            "rma_expression_file_cb=file&rma_expression_file_url=&"
            # Skip AKT signature.
            "sig_AKT=no&"
            # Change BCAT normalization.
            "sig_BCAT=yes (custom parameters)&"
            "sig_BCAT_apply_quantile_normalization=no&"
            "sig_BCAT_apply_shiftscale_normalization=no&"
            "sig_BCAT_num_genes=85&sig_BCAT_num_metagenes=2&"
            # No changes in E2F1.
            "sig_E2F1=yes (custom parameters)&"
            "sig_E2F1_apply_quantile_normalization=yes&"
            "sig_E2F1_apply_shiftscale_normalization=yes&"
            "sig_E2F1_num_genes=150&sig_E2F1_num_metagenes=2&"
            # Change genes in EGFR.
            "sig_EGFR=yes (custom parameters)&"
            "sig_EGFR_apply_quantile_normalization=no&"
            "sig_EGFR_apply_shiftscale_normalization=yes&"
            #"sig_EGFR_num_genes=50000&sig_EGFR_num_metagenes=2&"
            "sig_EGFR_num_genes=501&sig_EGFR_num_metagenes=2&"
            # Change quantile, genes, metagenes in ER.
            "sig_ER=yes (custom parameters)&"
            "sig_ER_apply_quantile_normalization=no&"
            "sig_ER_apply_shiftscale_normalization=yes&"
            "sig_ER_num_genes=150&sig_ER_num_metagenes=3&"
            "sig_HER2=yes (default parameters)&"
            "sig_IFNalpha=yes (default parameters)&"
            "sig_IFNgamma=yes (default parameters)&"
            "sig_MYC=yes (default parameters)&"
            "sig_P53=yes (default parameters)&"
            "sig_P63=yes (default parameters)&"
            "sig_PI3K=yes (default parameters)&"
            "sig_PR=yes (default parameters)&"
            "sig_RAS=yes (default parameters)&"
            "sig_SRC=yes (default parameters)&"
            "sig_STAT3=yes (default parameters)&"
            "sig_TGFB=yes (default parameters)&"
            "sig_TNFa=yes (default parameters)&"
            "which_signatures=I choose myself"
            )
        
    datafile_rma = datafile_mas5 = datafile_illu = None
    if options.rma_dataset is not None:
        assert os.path.exists(options.rma_dataset), \
               "RMA file not found: %s" % options.rma_dataset
        datafile_rma = os.path.realpath(options.rma_dataset)
    if options.mas5_dataset is not None:
        assert os.path.exists(options.mas5_dataset), \
               "MAS5 file not found: %s" % options.mas5_dataset
        datafile_mas5 = os.path.realpath(options.mas5_dataset)
    if options.illu_dataset is not None:
        assert os.path.exists(options.illu_dataset), \
               "ILLU file not found: %s" % options.illu_dataset
        datafile_illu = os.path.realpath(options.illu_dataset)
    assert datafile_rma or datafile_mas5 or datafile_illu, \
           "Please specify at least one data set."

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import after the library path is set.
    import time
    import arrayio
    from genomicode import config
    from genomicode import parallel
    from genomicode import archive
    from genomicode import hashlib
    from genomicode import matrixlib
    from genomicode import genepattern
    
    #sigdb_path, = args
    x = options.sigdb_path or config.sigdb_path
    sigdb_path = os.path.realpath(x)
    assert os.path.exists(sigdb_path), \
           "I could not find the signatures database: %s." % sigdb_path

    start_time = time.time()
    
    genepattern.fix_environ_path()
    
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the signatures and select the ones to score.
    # BUG: Should allow this to be specified on the command line.
    desired_tags = ["Pathway"]  # default
    if options.signature_tags:
        desired_tags = options.signature_tags[:]
    all_normalization = ["RMA", "MAS5", "ILLU"]
    desired_normalization = []
    if datafile_rma is not None:   # RMA datafile is specified.
        desired_normalization.append("RMA")
    if datafile_mas5 is not None:  # MAS5 datafile is specified.
        desired_normalization.append("MAS5")
    if datafile_illu is not None:  # ILLU datafile is specified.
        desired_normalization.append("ILLU")
        
    # If any signature IDs are specified, then use only those IDs and
    # ignore the desired tags.
    print "Reading signature database: %s." % sigdb_path
    desired_ids = []
    if options.signature_ids:
        desired_ids = options.signature_ids[:]
    x = read_signatures(
        sigdb_path, all_normalization, desired_ids, desired_tags)
    signatures = x
    orig_signatures = signatures[:]

    # Filter for just the normalization that we have data files for.
    # Keep track of why we filtered out certain signatures.
    why_dropped = {}  # ID -> explanation as string
    good = []
    for sig in signatures:
        if sig.Normalization.upper() in desired_normalization:
            good.append(sig)
            continue
        x = "Signature requires %s normalized data, but it was not provided."%(
            sig.Normalization.upper())
        why_dropped[sig.xID] = x
    signatures = good
    assert signatures, "No signatures available."

    # Process additional parameters from GenePattern.
    # o Do this before max_signatures, so that the maximum signatures
    #   is selected only out of the ones that the user specified.
    # o Do this before names and paths, so the variables will be
    #   aligned.
    # gp_imod_all_vars can be None or "".
    if options.gp_imod_all_vars:
        x = process_gp_imod_all_vars(
            options.gp_imod_all_vars, signatures, why_dropped)
        signatures, why_dropped = x

    sys.stdout.flush()
    DATA_rma = DATA_mas5 = DATA_illu = None
    if datafile_rma is not None:
        print "Reading RMA file: %s" % datafile_rma
        DATA_rma = arrayio.read(datafile_rma)
        DATA_rma = arrayio.convert(DATA_rma, to_format=arrayio.gct_format)
    if datafile_mas5 is not None:
        print "Reading MAS5 file: %s" % datafile_mas5
        DATA_mas5 = arrayio.read(datafile_mas5)
        DATA_mas5 = arrayio.convert(DATA_mas5, to_format=arrayio.gct_format)
    if datafile_illu is not None:
        print "Reading ILLU file: %s" % datafile_illu
        DATA_illu = arrayio.read(datafile_illu)
        DATA_illu = arrayio.convert(DATA_illu, to_format=arrayio.gct_format)
    # Don't handle the log.  Let pybinreg do it.
    # Make sure the data sets contain the same samples.  Align them if
    # necessary.
    DATA_all = [
        ("DATA_rma", DATA_rma), ("DATA_mas5", DATA_mas5),
        ("DATA_illu", DATA_illu)]
    DATA_all = [x for x in DATA_all if x[1]]
    for i in range(1, len(DATA_all)):
        key1, data1 = DATA_all[0]
        key2, data2 = DATA_all[i]
        assert key1 != key2
        assert data1 and data2
        assert data1.ncol() == data2.ncol(), \
               "%s and %s data sets have different numbers of samples." % (
            key1, key2)
        if matrixlib.are_cols_aligned(data1, data2):
            continue
        x = matrixlib.align_cols(data1, data2)
        data1_new, data2_new = x
        assert matrixlib.are_cols_aligned(data1_new, data2_new)
        # The samples in data1 (the reference) should not be changed.
        assert data1.ncol() == data1_new.ncol(), \
               "%s and %s data sets have different samples" % (
            key1, key2)
        assert matrixlib.are_cols_aligned(data1, data1_new)
        DATA_all[i] = key2, data2_new
    for key, data in DATA_all:
        if key == "DATA_rma":
            DATA_rma = data
        elif key == "DATA_mas5":
            DATA_mas5 = data
        elif key == "DATA_illu":
            DATA_illu = data
        else:
            raise AssertionError, "Unknown key: %s" % key
    print "Writing aligned signal files."
    if DATA_rma:
        arrayio.gct_format.write(
            DATA_rma, open(file_layout.DATASET_RMA, 'w'))
    if DATA_mas5:
        arrayio.gct_format.write(
            DATA_mas5, open(file_layout.DATASET_MAS5, 'w'))
    if DATA_illu:
        arrayio.gct_format.write(
            DATA_illu, open(file_layout.DATASET_ILLU, 'w'))

    # Figure out the names and paths for each signature.
    print "Finding signatures."
    names = [None] * len(signatures)   # SIG19_AKT[_modified]
    paths = [None] * len(signatures)   # <path>/SIG19_AKT[_modified]
    for i, sig in enumerate(signatures):
        name = "SIG%02d_%s" % (sig.xID, hashlib.hash_var(sig.Name))
        # If the user has modified the signature from the default
        # parameters, then make a note of it.
        if getattr(sig, "Changed", False):
            name = "%s_modified" % name
        outpath = os.path.join(file_layout.OUTPATH, name)
        names[i] = name
        paths[i] = outpath

    if options.max_signatures is not None:
        signatures = signatures[:options.max_signatures]

    # Make a list of the jobs.
    jobs = []  # list of cmd, outpath, outfile
    for i, sig in enumerate(signatures):
        name, outpath = names[i], paths[i]
        #print "Generating signature %s [%d:%d]" % (
        #    name, i+1, len(signatures))
        #sys.stdout.flush()
        
        quantile_normalize = False
        assert sig.Quantile.upper() in ["YES", "NO"]
        if sig.Quantile.upper() == "YES":
            quantile_normalize = True
        shift_scale_normalize = False
        assert sig.Shift_Scale.upper() in ["YES", "NO"]
        if sig.Shift_Scale.upper() == "YES":
            shift_scale_normalize = True
        
        #outfile = os.path.join(files.outpath, "%s.out.txt" % name)
        outfile = os.path.join(outpath, "out.txt")

        if sig.Normalization.upper() == "RMA":
            datafile = file_layout.DATASET_RMA
            assert DATA_rma
        elif sig.Normalization.upper() == "MAS5":
            datafile = file_layout.DATASET_MAS5
            assert DATA_mas5
        elif sig.Normalization.upper() == "ILLU":
            datafile = file_layout.DATASET_ILLU
            assert DATA_illu
        else:
            raise AssertionError, "Unknown normalization."

        # If the entire analysis should be archived, then go ahead and
        # archive each of the pybinreg runs too.  This will prevent
        # large analyses from taking up too much disk space.  The
        # drawback is that the files that are archived are no longer
        # available for use here.  Hopefully this won't be a problem.
        cmd = make_pybinreg_cmd(
            options.pybinreg, options.python, options.binreg_path,
            options.matlab, options.arrayplot, options.povray,
            options.cluster, options.libpath,
            outpath, options.archive, sig.Genes, sig.Metagenes,
            quantile_normalize, shift_scale_normalize,
            sig.Train0, sig.Train1, datafile)
        x = cmd, outpath, outfile
        jobs.append(x)

    # Run each of the jobs.
    if options.num_procs < 1 or options.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    if options.num_procs > 1:
        if parallel._find_parallel():
            num_sigs = min(options.num_procs, len(jobs))
            if num_sigs > 1:
                print "Predicting %d signatures at a time." % num_sigs
        else:
            print("I could not find GNU parallel.  "
                  "Predicting 1 signature at a time.")
            options.num_procs = 1
        sys.stdout.flush()

    DEBUG = False   # Can disable pybinreg temporarily for debugging.
    if not DEBUG:  
        if options.num_procs <= 1:
            for x in jobs:
                cmd, outpath, outfile = x
                run_one_pybinreg(cmd, outpath, outfile)
        else:
            run_many_pybinreg(jobs, options.num_procs)

    if signatures:
        print "Extracting the reports from each signature."
        report_files = extract_reports(names, paths, file_layout)
        
        print "Combining probabilities from each of the signatures."
        summarize_probabilities(signatures, names, paths, file_layout)

        print "Making heatmap of the results."
        sys.stdout.flush()
        summarize_heatmap(
            options.python, options.arrayplot, options.cluster,
            options.libpath, file_layout)

        print "Summarizing signatures."
        summarize_signatures(signatures, file_layout)

        print "Making a report."
        analysis_name = make_analysis_name(options)
        summarize_report(
            analysis_name, signatures, orig_signatures, report_files,
            start_time, why_dropped, file_layout)

    if options.archive:
        print "Compressing results."
        sys.stdout.flush()
        archive.zip_path(file_layout.ATTIC)
        for i, sig in enumerate(signatures):
            name, outpath = names[i], paths[i]
            archive.zip_path(outpath)
    
    print "Done."
Example No. 20
def summarize_vcf_file(filename, filestem, header, outfilename, lock):
    from genomicode import hashlib
    from genomicode import vcflib

    vcf = vcflib.read(filename)

    lines = []
    for i in range(vcf.num_variants()):
        var = vcflib.get_variant(vcf, i)

        caller_name = var.caller.name
        ref = var.ref
        alt = ",".join(var.alt)
        filter_str = vcf.caller.get_filter(var)

        for sample in var.samples:
            # If the sample name begins with a digit, hash_var may
            # have prepended an "X" to it.  Try to detect this case
            # and restore the original name.
            clean_sample = sample
            if sample == hashlib.hash_var(filestem):
                clean_sample = filestem

            source = "DNA"
            if caller_name == "Radia":
                # DNA    <clean_sample>       196B-lung
                # RNA    <clean_sample>_RNA   196B-lung_RNA
                # Figure out whether this is RNA and fix it.
                if clean_sample.endswith("_RNA"):
                    clean_sample = clean_sample[:-4]
                    source = "RNA"

            genodict = var.sample2genodict[sample]
            call = vcflib.get_call(var, sample)

            num_ref = vcflib._format_vcf_value(call.num_ref, None_char="")
            num_alt = vcflib._format_vcf_value(call.num_alt, None_char="")
            total_reads = vcflib._format_vcf_value(call.total_reads,
                                                   None_char="")
            vaf = vcflib._format_vcf_value(call.vaf, None_char="")
            call_str = vcflib._format_vcf_value(call.call, None_char="")
            GQ = genodict.get("GQ", "")
            if GQ in [None, "."]:
                GQ = ""

            x = caller_name, filestem, clean_sample, var.chrom, var.pos, \
                ref, alt, source, \
                num_ref, num_alt, total_reads, vaf, filter_str, call_str, GQ
            assert len(x) == len(header)
            x = "\t".join(map(str, x))
            lines.append(x)

            if len(lines) >= 100000:
                x = "\n".join(lines) + "\n"
                lock.acquire()
                handle = open(outfilename, 'a')
                handle.write(x)
                handle.close()
                lock.release()
                lines = []

    x = "\n".join(lines) + "\n"
    lock.acquire()
    handle = open(outfilename, 'a')
    handle.write(x)
    handle.close()
    lock.release()
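summarize_vcf_file is written to be driven from several processes at once: it buffers up to 100,000 lines and appends them to the shared output file only while holding the lock, so concurrent workers cannot interleave partial writes. A minimal driver, assuming the caller has already prepared the header tuple and one filestem per VCF (hypothetical names), might look like:

import multiprocessing

def summarize_many(vcf_files, filestems, header, outfilename):
    # One manager-backed lock, shared by all workers, serializes
    # appends to outfilename.
    manager = multiprocessing.Manager()
    lock = manager.Lock()
    pool = multiprocessing.Pool(processes=4)
    results = []
    for filename, filestem in zip(vcf_files, filestems):
        x = pool.apply_async(
            summarize_vcf_file,
            (filename, filestem, header, outfilename, lock))
        results.append(x)
    pool.close()
    pool.join()
    for x in results:
        x.get()   # re-raise any exception from a worker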
Example No. 21
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import hashlib
        from Betsy import module_utils
        
        bam_filenames = module_utils.find_bam_files(in_data.identifier)
        assert bam_filenames, "No .bam files."
        filelib.safe_mkdir(out_path)
        metadata = {}
        
        jobs = []
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            s, ext = os.path.splitext(f)
            sample = hashlib.hash_var(s)
            log_filename = os.path.join(out_path, "%s.log" % s)
            out_filename = os.path.join(out_path, f)
            x = filelib.GenericObject(
                in_filename=in_filename,
                sample=sample,
                log_filename=log_filename,
                out_filename=out_filename)
            jobs.append(x)
        
        gid = "group1"
        library = "library"
        platform_unit = "platform"
        #sample = "sample"
        platform = "illumina"

        # java -Xmx5g -jar AddOrReplaceReadGroups.jar
        #   I=<input.sam or .bam> O=<output.bam> ID=<group ID>
        #   LB=<group library> PU=<platform unit> SM=<group sample name>
        #   PL=<platform> CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT
        picard_jar = alignlib.find_picard_jar("picard")

        # Make a list of commands.
        sq = parallel.quote
        commands = []
        for j in jobs:
            x = [
                "java", "-Xmx5g",
                "-jar", sq(picard_jar),
                "AddOrReplaceReadGroups", 
                "I=%s" % sq(j.in_filename),
                "O=%s" % sq(j.out_filename),
                "ID=%s" % gid,
                "LB=%s" % library,
                "PU=%s" % platform_unit,
                "SM=%s" % j.sample,
                "PL=%s" % platform,
                #"CREATE_INDEX=true",
                "VALIDATION_STRINGENCY=LENIENT",
                ]
            x = " ".join(x)
            x = "%s >& %s" % (x, sq(j.log_filename))
            commands.append(x)
            
        parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores

        # Make sure the analysis completed successfully.
        # Make sure outfiles exist.
        out_filenames = [j.out_filename for j in jobs]
        filelib.assert_exists_nz_many(out_filenames)

        # Check the log files to make sure there are no errors.
        for j in jobs:
            check_log_file(j.log_filename)

        return metadata
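For a hypothetical input test.bam, one element of the commands list built above expands to a single shell line of roughly this shape (the >& redirect is csh-style, matching how parallel.pshell appears to run commands; wrapped here for readability):

# java -Xmx5g -jar /path/to/picard.jar AddOrReplaceReadGroups \
#   I=/in_path/test.bam O=/out_path/test.bam ID=group1 LB=library \
#   PU=platform SM=test PL=illumina \
#   VALIDATION_STRINGENCY=LENIENT >& /out_path/test.log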
Example No. 22
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import alignlib
        from genomicode import parallel
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, strand_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        assert fastq_files, "I could not find any FASTQ files."
        ref = alignlib.create_reference_genome(reference_node.identifier)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "RSEM %s" % alignlib.get_rsem_version()

        # Figure out whether to align to genome or transcriptome.
        x = out_attributes["align_to"]
        assert x in ["genome", "transcriptome"]
        align_to_genome = (x == "genome")

        # RSEM makes files:
        # <sample_name>.genome.bam
        # <sample_name>.transcript.bam
        # <sample_name>.genes.results
        # <sample_name>.isoforms.results
        # <sample_name>.stat
        #
        # Does not work right if there is a space in the sample name.
        # Therefore, give a hashed sample name, and then re-name
        # later.

        # Make a list of the jobs to run.
        jobs = []
        for x in fastq_files:
            sample, pair1, pair2 = x
            sample_h = hashlib.hash_var(sample)

            x1, x2, x3 = mlib.splitpath(pair1)
            x = "%s%s" % (hashlib.hash_var(x2), x3)
            pair1_h = os.path.join(out_path, x)
            pair2_h = None
            if pair2:
                x1, x2, x3 = mlib.splitpath(pair2)
                x = "%s%s" % (hashlib.hash_var(x2), x3)
                pair2_h = os.path.join(out_path, x)
            results_filename = os.path.join(out_path,
                                            "%s.genes.results" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            x = filelib.GenericObject(sample=sample,
                                      sample_h=sample_h,
                                      pair1=pair1,
                                      pair2=pair2,
                                      pair1_h=pair1_h,
                                      pair2_h=pair2_h,
                                      results_filename=results_filename,
                                      log_filename=log_filename)
            jobs.append(x)

        # Make sure hashed samples are unique.
        seen = {}
        for j in jobs:
            assert j.sample_h not in seen, \
                   "Dup (%d): %s" % (len(jobs), j.sample_h)
            assert j.pair1_h not in seen
            seen[j.sample_h] = 1
            seen[j.pair1_h] = 1
            # pair2_h is None for single-end samples; only check it
            # for uniqueness when it exists.
            if j.pair2_h:
                assert j.pair2_h not in seen
                seen[j.pair2_h] = 1

        # Symlink the fastq files.
        for j in jobs:
            os.symlink(j.pair1, j.pair1_h)
            if j.pair2:
                os.symlink(j.pair2, j.pair2_h)

        s2fprob = {
            "unstranded": None,
            "firststrand": 0.0,
            "secondstrand": 1.0,
        }
        assert stranded.stranded in s2fprob, "Unknown stranded: %s" % \
               stranded.stranded
        forward_prob = s2fprob[stranded.stranded]

        # How much memory for bowtie.  May need to increase this if
        # there are lots of memory warnings in the log files:
        #   Warning: Exhausted best-first chunk memory for read
        #   ST-J00106:110:H5NY5BBXX:6:1101:18203:44675 1:N:0:1/1
        #   (patid 2076693); skipping read
        # Default is 64.
        # Seems like too high a value can cause problems.
        #chunkmbs = 4*1024   # Generates warnings.
        chunkmbs = 512

        # Get lots of warnings with bowtie:
        # Warning: Detected a read pair whose two mates have different names

        # Use STAR aligner instead.
        use_STAR = True

        sq = parallel.quote
        commands = []
        for j in jobs:
            # Debug: If the results file exists, don't run it again.
            if filelib.exists_nz(j.results_filename) and \
                   filelib.exists(j.log_filename):
                continue
            # If using the STAR aligner, then most memory efficient
            # way is to let STAR take care of the multiprocessing.
            nc = max(1, num_cores / len(jobs))
            if use_STAR:
                nc = num_cores

            keywds = {}
            if use_STAR:
                keywds["align_with_star"] = True
            else:
                keywds["align_with_bowtie2"] = True
            x = alignlib.make_rsem_command(ref.fasta_file_full,
                                           j.sample_h,
                                           j.pair1_h,
                                           fastq_file2=j.pair2_h,
                                           forward_prob=forward_prob,
                                           output_genome_bam=align_to_genome,
                                           bowtie_chunkmbs=chunkmbs,
                                           num_threads=nc,
                                           **keywds)
            x = "%s >& %s" % (x, sq(j.log_filename))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num cores"] = num_cores
        # Need to run in out_path.  Otherwise, files will be everywhere.
        nc = num_cores
        if use_STAR:
            nc = 1
        parallel.pshell(commands, max_procs=nc, path=out_path)

        # Rename the hashed sample names back to the original unhashed
        # ones.
        files = os.listdir(out_path)
        rename_files = []  # list of (src, dst)
        for j in jobs:
            if j.sample == j.sample_h:
                continue
            for f in files:
                if not f.startswith(j.sample_h):
                    continue
                src = os.path.join(out_path, f)
                x = j.sample + f[len(j.sample_h):]
                dst = os.path.join(out_path, x)
                rename_files.append((src, dst))
        for src, dst in rename_files:
            filelib.assert_exists(src)
            os.rename(src, dst)

        # Delete the symlinked fastq files.
        for j in jobs:
            filelib.safe_unlink(j.pair1_h)
            filelib.safe_unlink(j.pair2_h)

        # Make sure the analysis completed successfully.
        x1 = [x.results_filename for x in jobs]
        x2 = [x.log_filename for x in jobs]
        filelib.assert_exists_nz_many(x1 + x2)

        return metadata
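The hash-then-rename round trip above is easiest to see on a concrete name. A sketch with a hypothetical sample "my sample" (which hash_var would turn into "my_sample"):

# RSEM runs under the hashed name and leaves files like
# "my_sample.genes.results".  The rename loop maps each one back:
sample, sample_h = "my sample", "my_sample"
f = "my_sample.genes.results"
assert f.startswith(sample_h)
dst = sample + f[len(sample_h):]
assert dst == "my sample.genes.results"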
Example No. 23
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, orient_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        assert os.path.exists(ref.fasta_file_full)
        orient = mlib.read_orientation(orient_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

        # Bowtie2 doesn't handle files with spaces in them.  Make
        # temporary files without spaces.

        # Make a list of the jobs to run.
        jobs = []
        for i, x in enumerate(fastq_files):
            sample, pair1, pair2 = x
            bam_filename = os.path.join(out_path, "%s.bam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            sample_h = hashlib.hash_var(sample)
            temp_pair1 = "%d_%s_1.fa" % (i, sample_h)
            temp_pair2 = None
            if pair2:
                temp_pair2 = "%d_%s_2.fa" % (i, sample_h)
            j = filelib.GenericObject(sample=sample,
                                      pair1=pair1,
                                      pair2=pair2,
                                      temp_pair1=temp_pair1,
                                      temp_pair2=temp_pair2,
                                      bam_filename=bam_filename,
                                      log_filename=log_filename)
            jobs.append(j)

        for j in jobs:
            os.symlink(j.pair1, j.temp_pair1)
            if j.pair2:
                os.symlink(j.pair2, j.temp_pair2)

        # Generate bowtie2 commands for each of the files.
        attr2orient = {
            "single": None,
            "paired_fr": "fr",
            "paired_rf": "rf",
            "paired_ff": "ff",
        }
        orientation = attr2orient[orient.orientation]
        #x = sample_node.data.attributes["orientation"]
        #orientation = attr2orient[x]

        # Takes ~4 Gb per job.
        samtools = mlib.findbin("samtools")
        sq = parallel.quote
        commands = []
        for j in jobs:
            #sample, pair1, pair2, bam_filename, log_filename = x
            nc = max(1, num_cores / len(jobs))

            # bowtie2 -p 8 -x <genome> -1 <.fq> -2 <.fq> --fr
            #  2> test.log | samtools view -bS -o test.bam -
            x1 = alignlib.make_bowtie2_command(ref.fasta_file_full,
                                               j.temp_pair1,
                                               fastq_file2=j.temp_pair2,
                                               orientation=orientation,
                                               num_threads=nc)
            x2 = [
                sq(samtools),
                "view",
                "-bS",
                "-o",
                sq(j.bam_filename),
                "-",
            ]
            x2 = " ".join(x2)
            x = "%s 2> %s | %s" % (x1, sq(j.log_filename), x2)
            #x = "%s >& %s" % (x, sq(log_filename))
            commands.append(x)
        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        x1 = [x.bam_filename for x in jobs]
        x2 = [x.log_filename for x in jobs]
        filelib.assert_exists_nz_many(x1 + x2)

        return metadata
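Each element of commands here is one alignment pipeline. For a hypothetical paired-end sample "S1" (job index 0), the generated line looks roughly like the following; the exact bowtie2 flags depend on alignlib.make_bowtie2_command:

# bowtie2 -p 2 -x /ref/genome --fr -1 0_S1_1.fa -2 0_S1_2.fa \
#   2> /out_path/S1.log | samtools view -bS -o /out_path/S1.bam -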
Example No. 24
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from genomicode import config
        from Betsy import module_utils

        bam_node, group_node = antecedents
        bam_path = module_utils.check_inpath(bam_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Get options.
        treat_sample = module_utils.get_user_option(user_options,
                                                    "treatment_sample",
                                                    not_empty=True)
        control_sample = module_utils.get_user_option(user_options,
                                                      "control_sample")
        genome_size = module_utils.get_user_option(user_options,
                                                   "macs_genome",
                                                   not_empty=True)
        shiftsize = module_utils.get_user_option(user_options,
                                                 "macs_shiftsize")
        if shiftsize:
            shiftsize = int(shiftsize)

        # Set the name.  Use the hashed sample names so the name is
        # safe to use in file names.
        name = hashlib.hash_var(treat_sample)
        if control_sample:
            x = hashlib.hash_var(control_sample)
            name = "%s_vs_%s" % (name, x)

        # Make sure the samples exist.
        samples = [x[1] for x in sample_groups]
        assert treat_sample in samples, "Unknown sample: %s" % treat_sample
        if control_sample:
            assert control_sample in samples, \
                   "Unknown sample: %s" % control_sample

        # Find the BAM files.
        treat_filename = find_bam_file(bam_path, treat_sample, sample_groups)
        assert treat_filename, "Missing bam file for %s" % treat_sample
        control_filename = None
        if control_sample:
            control_filename = find_bam_file(bam_path, control_sample,
                                             sample_groups)
            assert control_filename, "Missing bam file for %s" % control_sample

        cmd = make_macs14_command(treat_filename,
                                  control_filename,
                                  name=name,
                                  genome_size=genome_size,
                                  shiftsize=shiftsize,
                                  save_bedgraph_file=True)
        parallel.sshell(cmd, path=out_path)

        # Run Rscript on the model, if one was generated.
        model_file = os.path.join(out_path, "%s_model.r" % name)
        if os.path.exists(model_file):
            Rscript = filelib.which_assert(config.Rscript)
            cmd = [parallel.quote(Rscript), model_file]
            parallel.sshell(cmd, path=out_path)

        files = [
            "%s_peaks.xls" % name,
            "%s_summits.bed" % name,
        ]
        filenames = [os.path.join(out_path, x) for x in files]
        filelib.assert_exists_nz_many(filenames)
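find_bam_file comes from the run_MACS14 module and is not shown here. A hypothetical reconstruction of its contract, matching a sample to a BAM file under bam_path by either its raw or hashed name, could be:

import os
from genomicode import hashlib

def find_bam_file_sketch(bam_path, sample, sample_groups):
    # Hypothetical sketch: return the BAM file for this sample, or
    # None if it cannot be found.  The real version presumably also
    # consults sample_groups to map samples to file names.
    for name in [sample, hashlib.hash_var(sample)]:
        filename = os.path.join(bam_path, "%s.bam" % name)
        if os.path.exists(filename):
            return filename
    return None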
Example No. 25
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        from genomicode import filelib
        from genomicode import hashlib
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        simple_node = in_data
        filelib.assert_exists_nz(simple_node.identifier)

        gene_file = mlib.get_user_option(
            user_options, "cancer_genes_file", not_empty=True, check_file=True)

        # Read the cancer genes file.
        # <Gene ID>  <Gene Symbol>  <Dataset>  ...
        symbol2info = {}  # symbol -> d
        gene_iter = filelib.read_row(gene_file, header=1)
        header = None
        for d in gene_iter:
            assert "Gene Symbol" in d._header
            if header is None:
                header = [
                    x for x in d._header
                    if x not in ["Gene ID", "Gene Symbol"]]
            if not d.Gene_Symbol:
                continue
            symbol2info[d.Gene_Symbol] = d

        # Read the variant file.
        SVM = SimpleVariantMatrix.read_as_am(simple_node.identifier)

        GENE_H = "Annovar______Gene.refGene"
        assert GENE_H in SVM.headers, "Missing annotation: %s" % GENE_H
        GENES = SVM[GENE_H]

        # Align the matrix to the simple variant matrix.
        assert header is not None, "No genes in file: %s" % gene_file
        gene_headers = header
        gene_annotations = []
        for i, gene_str in enumerate(GENES):
            # Format of genes:
            # PFN1P2
            # PMS2P2,PMS2P7
            values = [""] * len(gene_headers)
            genes = gene_str.split(",")
            for gene in genes:
                if gene not in symbol2info:
                    continue
                d = symbol2info[gene]
                for j, h in enumerate(gene_headers):
                    h = hashlib.hash_var(h)
                    assert hasattr(d, h)
                    x = getattr(d, h)
                    assert x in ["", "1"]
                    if x == "1":
                        values[j] = "1"
            gene_annotations.append(values)
        # Convert the headers and annotations to SVM format.
        gene_headers = ["Cancer Genes______%s" % x for x in gene_headers]
        gene_annotations = jmath.transpose(gene_annotations)

        # Make the new SimpleVariantMatrix.
        # Figure out where to put these annotations.
        INDEX = 4
        # If Annovar exists, put after.
        I = [i for (i, x) in enumerate(SVM.headers)
             if x.upper().startswith("ANNOVAR")]
        if I:
            INDEX = max(INDEX, max(I)+1)
        # If SnpEff exists, put after.
        I = [i for (i, x) in enumerate(SVM.headers)
             if x.upper().startswith("SNPEFF")]
        if I:
            INDEX = max(INDEX, max(I)+1)
        # If COSMIC exists, put after.
        I = [i for (i, x) in enumerate(SVM.headers)
             if x.upper().startswith("COSMIC")]
        if I:
            INDEX = max(INDEX, max(I)+1)
        headers = SVM.headers[:INDEX] + gene_headers + SVM.headers[INDEX:]
        x = [SVM.header2annots[x] for x in SVM.headers_h]
        all_annots = x[:INDEX] + gene_annotations + x[INDEX:]
        merged = AnnotationMatrix.create_from_annotations(
            headers, all_annots, headerlines=SVM.headerlines)

        SimpleVariantMatrix.write_from_am(outfile, merged)
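The INDEX bookkeeping above simply places the new "Cancer Genes" columns after the last Annovar/SnpEff/COSMIC header block, or at column 4 if none of those annotations are present. A toy check with hypothetical headers:

headers = ["Chrom", "Pos", "Ref", "Alt",
           "Annovar______Func.refGene", "Annovar______Gene.refGene",
           "SnpEff______Effect"]
INDEX = 4
for prefix in ["ANNOVAR", "SNPEFF", "COSMIC"]:
    I = [i for (i, x) in enumerate(headers) if x.upper().startswith(prefix)]
    if I:
        INDEX = max(INDEX, max(I)+1)
assert INDEX == 7   # the new columns land right after the SnpEff block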