Example #1
def find_cluster_files(cluster_path):
    # Return a dictionary of:
    #   "cdt" : cdt_filename
    #   "atr" : atr_filename
    #   "gtr" : gtr_filename
    #   "kag" : kag_filename
    #   "kgg" : kgg_filename
    # Any of these files can be missing.
    import os
    from genomicode import filelib

    filelib.assert_exists(cluster_path)

    opj = os.path.join
    cdt = opj(cluster_path, "signal.cdt")
    atr = opj(cluster_path, "array_tree.atr")
    gtr = opj(cluster_path, "gene_tree.gtr")
    kag = opj(cluster_path, "array_cluster.kag")
    kgg = opj(cluster_path, "gene_cluster.kgg")

    cluster_files = {}
    if filelib.exists_nz(cdt):
        cluster_files["cdt"] = cdt
    if filelib.exists_nz(atr):
        cluster_files["atr"] = atr
    if filelib.exists_nz(gtr):
        cluster_files["gtr"] = gtr
    if filelib.exists_nz(kag):
        cluster_files["kag"] = kag
    if filelib.exists_nz(kgg):
        cluster_files["kgg"] = kgg

    assert "cdt" in cluster_files, "No clustered file."

    return cluster_files
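
A hedged usage sketch follows; the directory name is hypothetical. Only the "cdt" entry is guaranteed by the assertion above, so the optional files are best read with dict.get():

cluster_files = find_cluster_files("cluster_output")   # hypothetical path
cdt_file = cluster_files["cdt"]       # always present (asserted above)
atr_file = cluster_files.get("atr")   # None if no array tree was produced
kgg_file = cluster_files.get("kgg")   # None if no gene clusters were produced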
Example #2
def get_config(name,
               which_assert_file=False,
               assert_exists=False,
               quote=False):
    from genomicode import filelib
    from genomicode import config

    assert hasattr(config, name), "Not configured for genomicode: %s" % name
    x = getattr(config, name)
    if which_assert_file:
        x = filelib.which_assert(x)
    elif assert_exists:
        filelib.assert_exists(x)
    if quote:
        x = sq(x)
    return x
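
Note that sq is not defined in this snippet; it is assumed to be a shell-quoting helper (such as parallel.quote) defined elsewhere in the module. A hedged usage sketch, with hypothetical configuration names:

# Hypothetical entries in the genomicode config module.
samtools = get_config("samtools", which_assert_file=True, quote=True)
radia_path = get_config("radia_dir", assert_exists=True)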
Example #3
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        from genomicode import filelib

        vcf_node = in_data
        # Some callers, like jointsnvmix, will create vcf files for
        # each chromosome.  To avoid picking these up, only accept
        # .vcf files from the top level.
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf",
                                                   toplevel_only=True)
        assert vcf_filenames, "No .vcf files: %s" % vcf_node.identifier
        metadata = {}

        tmp_path = "indexed.vcf"
        m = merge_vcf_files(vcf_filenames, out_filename, num_cores, tmp_path)
        metadata.update(m)
        filelib.assert_exists(out_filename)  # may be size 0

        return metadata
Example #4
def _run_filterRadia_with_restart(cmd, cancer_sample, chrom, logfile):
    # Sometimes samtools crashes in the middle of a run.  Detect this
    # case, and re-run the analysis if needed.
    from genomicode import parallel
    from genomicode import filelib

    num_tries = 0
    while num_tries <= 3:
        num_tries += 1
        parallel.sshell(cmd, ignore_nonzero_exit=True)
        filelib.assert_exists(logfile)
        log = open(logfile).read()
        # Empty logfile means cmd completed successfully.
        if not log.strip():
            break
        # Look for evidence that samtools died.  If this occurs, try again.
        # 06/29/2016 09:57:16 AM  ERROR   The return code of '1' from the
        #   following filter command indicates an error.
        # 06/29/2016 09:57:16 AM  ERROR   Error from /usr/bin/python
        #   /usr/local/radia/scripts/createBlatFile.pyc 196C-lung2
        #   radia2.tmp/196C-lung2_dnaFiltered_chr1.vcf
        #   radia2.tmp/196C-lung2_mpileup_rna_origin_chr1.vcf
        #   -o radia2.tmp/196C-lung2_blatInput_chr1.fa
        #   --allVCFCalls --blatRnaNormalReads --blatRnaTumorReads:
        # <Traceback>
        # [...]
        #   samtoolsCall.kill()
        # [...]
        # OSError: [Errno 3] No such process
        if log.find("samtoolsCall.kill") >= 0 \
               and log.find("No such process") >= 0:
            continue
        # Otherwise, the process failed for some other reason.  Raise
        # an exception.
        raise AssertionError, "Problem filtering: %s %s\n%s" % (cancer_sample,
                                                                chrom, log)
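
The pattern above (a bounded number of attempts, retrying only on a known transient failure signature) can be written generically; a minimal sketch, independent of the genomicode helpers:

def run_with_retries(run_fn, is_transient_error, max_tries=4):
    # run_fn() returns the log/error text; empty means success.
    # is_transient_error(text) decides whether retrying is worthwhile.
    for _ in range(max_tries):
        log = run_fn()
        if not log.strip():
            return                      # completed successfully
        if is_transient_error(log):
            continue                    # known transient crash; try again
        raise AssertionError("Non-transient failure:\n%s" % log)
    raise AssertionError("Still failing after %d tries" % max_tries)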
Example #5
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import alignlib
        from genomicode import parallel
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, strand_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        assert fastq_files, "I could not find any FASTQ files."
        ref = alignlib.create_reference_genome(reference_node.identifier)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "RSEM %s" % alignlib.get_rsem_version()

        # Figure out whether to align to genome or transcriptome.
        x = out_attributes["align_to"]
        assert x in ["genome", "transcriptome"]
        align_to_genome = (x == "genome")

        # RSEM makes files:
        # <sample_name>.genome.bam
        # <sample_name>.transcript.bam
        # <sample_name>.genes.results
        # <sample_name>.isoforms.results
        # <sample_name>.stat
        #
        # Does not work right if there is a space in the sample name.
        # Therefore, give a hashed sample name, and then re-name
        # later.

        # Make a list of the jobs to run.
        jobs = []
        for x in fastq_files:
            sample, pair1, pair2 = x
            sample_h = hashlib.hash_var(sample)

            x1, x2, x3 = mlib.splitpath(pair1)
            x = "%s%s" % (hashlib.hash_var(x2), x3)
            pair1_h = os.path.join(out_path, x)
            # pair2_h must be defined even for single-end samples, or the
            # GenericObject below would raise a NameError.
            pair2_h = None
            if pair2:
                x1, x2, x3 = mlib.splitpath(pair2)
                x = "%s%s" % (hashlib.hash_var(x2), x3)
                pair2_h = os.path.join(out_path, x)
            results_filename = os.path.join(out_path,
                                            "%s.genes.results" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            x = filelib.GenericObject(sample=sample,
                                      sample_h=sample_h,
                                      pair1=pair1,
                                      pair2=pair2,
                                      pair1_h=pair1_h,
                                      pair2_h=pair2_h,
                                      results_filename=results_filename,
                                      log_filename=log_filename)
            jobs.append(x)

        # Make sure hashed samples are unique.
        seen = {}
        for j in jobs:
            assert j.sample_h not in seen, \
                   "Dup (%d): %s" % (len(jobs), j.sample_h)
            assert j.pair1_h not in seen
            seen[j.sample_h] = 1
            seen[j.pair1_h] = 1
            if j.pair2_h:
                assert j.pair2_h not in seen
                seen[j.pair2_h] = 1

        # Symlink the fastq files.
        for j in jobs:
            os.symlink(j.pair1, j.pair1_h)
            if j.pair2:
                os.symlink(j.pair2, j.pair2_h)

        s2fprob = {
            "unstranded": None,
            "firststrand": 0.0,
            "secondstrand": 1.0,
        }
        assert stranded.stranded in s2fprob, "Unknown stranded: %s" % \
               stranded.stranded
        forward_prob = s2fprob[stranded.stranded]

        # How much memory for bowtie.  May need to increase this if
        # there are lots of memory warnings in the log files:
        #   Warning: Exhausted best-first chunk memory for read
        #   ST-J00106:110:H5NY5BBXX:6:1101:18203:44675 1:N:0:1/1
        #   (patid 2076693); skipping read
        # Default is 64.
        # Seems like too high a value can cause problems.
        #chunkmbs = 4*1024   # Generates warnings.
        chunkmbs = 512

        # Get lots of warnings with bowtie:
        # Warning: Detected a read pair whose two mates have different names

        # Use STAR aligner instead.
        use_STAR = True

        sq = parallel.quote
        commands = []
        for j in jobs:
            # Debug: If the results file exists, don't run it again.
            if filelib.exists_nz(j.results_filename) and \
                   filelib.exists(j.log_filename):
                continue
            # If using the STAR aligner, then most memory efficient
            # way is to let STAR take care of the multiprocessing.
            nc = max(1, num_cores / len(jobs))
            if use_STAR:
                nc = num_cores

            keywds = {}
            if use_STAR:
                keywds["align_with_star"] = True
            else:
                keywds["align_with_bowtie2"] = True
            x = alignlib.make_rsem_command(ref.fasta_file_full,
                                           j.sample_h,
                                           j.pair1_h,
                                           fastq_file2=j.pair2_h,
                                           forward_prob=forward_prob,
                                           output_genome_bam=align_to_genome,
                                           bowtie_chunkmbs=chunkmbs,
                                           num_threads=nc,
                                           **keywds)
            x = "%s >& %s" % (x, sq(j.log_filename))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num cores"] = num_cores
        # Need to run in out_path.  Otherwise, files will be everywhere.
        nc = num_cores
        if use_STAR:
            nc = 1
        parallel.pshell(commands, max_procs=nc, path=out_path)

        # Rename the hashed sample names back to the original unhashed
        # ones.
        files = os.listdir(out_path)
        rename_files = []  # list of (src, dst)
        for j in jobs:
            if j.sample == j.sample_h:
                continue
            for f in files:
                if not f.startswith(j.sample_h):
                    continue
                src = os.path.join(out_path, f)
                x = j.sample + f[len(j.sample_h):]
                dst = os.path.join(out_path, x)
                rename_files.append((src, dst))
        for src, dst in rename_files:
            filelib.assert_exists(src)
            os.rename(src, dst)

        # Delete the symlinked fastq files.
        for j in jobs:
            filelib.safe_unlink(j.pair1_h)
            filelib.safe_unlink(j.pair2_h)

        # Make sure the analysis completed successfully.
        x1 = [x.results_filename for x in jobs]
        x2 = [x.log_filename for x in jobs]
        filelib.assert_exists_nz_many(x1 + x2)

        return metadata
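
The rename loop near the end undoes the sample-name hashing done at the start (RSEM output prefixes must not contain spaces). As a standalone helper, the same step might look like this (a sketch, not part of the module):

import os

def rename_hashed_outputs(out_path, sample, sample_h):
    # Rename files such as <sample_h>.genes.results back to
    # <sample>.genes.results after RSEM has finished.
    if sample == sample_h:
        return
    for f in os.listdir(out_path):
        if not f.startswith(sample_h):
            continue
        src = os.path.join(out_path, f)
        dst = os.path.join(out_path, sample + f[len(sample_h):])
        os.rename(src, dst)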
Example #6
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        import zipfile
        #import subprocess
        import arrayio
        from genomicode import parallel
        from genomicode import filelib
        from Betsy import module_utils as mlib
        #from genomicode import config

        in_data = antecedents
        if not os.path.exists(out_path):
            os.mkdir(out_path)
        metadata = {}

        module_name = 'IlluminaExpressionFileCreator'
        params = {}
        if zipfile.is_zipfile(in_data.identifier):
            params['idat.zip'] = in_data.identifier
        else:
            # Add ".zip" to the end of the file.
            zipfile_name = os.path.split(in_data.identifier)[-1] + '.zip'
            zip_directory(in_data.identifier, zipfile_name)
            params['idat.zip'] = os.path.join(".", zipfile_name)

        x = user_options.get("illu_manifest", MP.DEFAULT_MANIFEST)
        assert x in MP.ILLU_MANIFEST, "Unknown manifest: %s" % x
        params['manifest'] = x

        x = user_options.get("illu_chip", MP.DEFAULT_CHIP)
        assert x in MP.ILLU_CHIP, "Unknown chip: %s" % x
        params['chip'] = x

        x = user_options.get("illu_bg_mode")
        if x is not None:
            assert x in ['true', 'false'], \
                   'illu_bg_mode should be true or false'
            params['background.subtraction.mode'] = x

        x = user_options.get("illu_coll_mode")
        if x is not None:
            assert x in ['none', 'max', 'median'], \
                   'ill_coll_mode is not correct'
            params['collapse.mode'] = str(x)

        if 'illu_clm' in user_options:
            params['clm'] = str(user_options['illu_clm'])
        if 'illu_custom_chip' in user_options:
            params['chip'] = str(user_options['illu_custom_chip'])
        if 'illu_custom_manifest' in user_options:
            params['custom.manifest'] = str(
                user_options['illu_custom_manifest'])

        gp_module = mlib.get_config("genepattern", which_assert_file=True)
        download_directory = 'illumina_result'
        sq = parallel.quote
        cmd = [
            sq(gp_module),
            module_name,
            '-o',
            sq(download_directory),
        ]
        for key in params.keys():
            x = ['--parameters', key + ':' + params[key]]
            cmd.extend(x)
        cmd = " ".join(cmd)
        parallel.sshell(cmd)
        metadata["commands"] = [cmd]
        filelib.assert_exists(download_directory)

        result_files = os.listdir(download_directory)
        #assert 'stderr.txt' not in result_files,('gene_pattern get error '+
        #        'The contents of stderr.txt is:'+
        #        file(os.path.join(download_directory,'stderr.txt')).read())

        for result_file in result_files:
            if result_file == 'System.out':
                continue
            # BUG: What if there are duplicate sample names?
            M = arrayio.read(os.path.join(download_directory, result_file))
            a = M._col_names['_SAMPLE_NAME']
            b = sorted(a)
            index = []
            for i in b:
                index.append(a.index(i))
            M_new = M.matrix(None, index)

            result_path = os.path.join(out_path, result_file)
            f = file(result_path, 'w')
            arrayio.gct_format.write(M_new, f)
            f.close()

        return metadata
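
zip_directory is called above but not shown; a minimal sketch of what such a helper might do, using the standard zipfile module (the real implementation may differ):

import os
import zipfile

def zip_directory(dir_path, zip_filename):
    # Recursively add every file under dir_path to zip_filename,
    # storing names relative to dir_path's parent directory.
    parent = os.path.dirname(os.path.abspath(dir_path))
    zf = zipfile.ZipFile(zip_filename, "w", zipfile.ZIP_DEFLATED)
    try:
        for root, dirs, files in os.walk(dir_path):
            for name in files:
                full = os.path.join(root, name)
                zf.write(full, os.path.relpath(full, parent))
    finally:
        zf.close()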
Example #7
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import vcflib
        from Betsy import module_utils as mlib

        vcf_node, nc_node = antecedents
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf")
        assert vcf_filenames, "No .vcf files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Filenames:
        # <caller>.vcf

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])
        genome = mlib.get_user_option(user_options,
                                      "snpeff_genome",
                                      not_empty=True)
        databases = list_snpeff_databases()
        assert genome in databases, "Unknown genome database: %s" % genome

        # For each caller, do the SnpEFF calls.  Some callers include
        # the somatic information, others do not.  If germline samples
        # are present, then do with _cancer.  Otherwise, do not.

        # java -Xmx16g -jar $SNPEFF -v -cancer -cancerSamples vcf03.txt
        #   GRCh37.75 vcf02.txt 1> test03.txt 2> test03.log

        # Don't bother annotating positions that do not pass filter.
        # Filter them out first based on FILTER column.

        opj = os.path.join
        jobs = []
        for in_filename in vcf_filenames:
            path, stem, ext = mlib.splitpath(in_filename)
            samples_file = opj(out_path, "%s.cancerSamples.txt" % stem)
            filtered_filename = opj(out_path, "%s.filtered_input" % stem)
            out_filename = opj(out_path, "%s.vcf" % stem)
            log_filename = opj(out_path, "%s.log" % stem)
            # Keep stem so the error message at the end of run() can use it.
            x = filelib.GenericObject(in_filename=in_filename,
                                      stem=stem,
                                      samples_file=samples_file,
                                      filtered_filename=filtered_filename,
                                      out_filename=out_filename,
                                      log_filename=log_filename)
            jobs.append(x)

        # First, filter each of the VCF files.
        commands = []
        for j in jobs:
            # For debugging.  If this file exists, don't filter it again.
            if os.path.exists(j.filtered_filename):
                continue
            args = j.in_filename, j.filtered_filename, wgs_or_wes
            x = vcflib.filter_vcf_file, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)

        # Make the cancer_samples files.
        for j in jobs:
            # Will generate this if there are cancer samples.
            make_cancer_samples_file(j.filtered_filename, nc_match,
                                     j.samples_file)

        # Make a list of commands.
        commands = []
        for j in jobs:
            cancer = False
            if os.path.exists(j.samples_file):
                cancer = True
            x = make_snpeff_command(j.filtered_filename,
                                    genome,
                                    j.out_filename,
                                    j.log_filename,
                                    is_cancer=cancer,
                                    cancer_samples_file=j.samples_file)
            commands.append(x)

        nc = mlib.calc_max_procs_from_ram(16, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = commands
        metadata["num_cores"] = nc

        # Make sure the analysis completed successfully.
        x = [x.out_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Log files should be empty.
        for j in jobs:
            filelib.assert_exists(j.log_filename)
            assert not filelib.exists_nz(j.log_filename), \
                   "Error with %s.\n%s" % (j.stem, j.log_filename)
            filelib.safe_unlink(j.log_filename)

        return metadata
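
Each entry handed to parallel.pyfun above is a (function, args, keywords) triple; an equivalent single-process fallback for the filtering step would simply be (a sketch):

# Serial equivalent of parallel.pyfun(commands, num_procs=num_cores).
for fn, args, keywds in commands:
    fn(*args, **keywds)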
Example #8
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import os
        import subprocess
        #from Betsy import read_label_file
        from genomicode import filelib
        from genomicode import config
        #from genomicode import arraysetlib

        cls_node_train, data_node_train = antecedents
        #result, label_line, class_name = read_label_file.read(
        #    cls_node_train.identifier)
        #x = arraysetlib.read_cls_file(cls_node_train.identifier)
        #class_names, classes = x

        metadata = {}

        module_name = 'WeightedVotingXValidation'
        module_id_version = '00028:2'

        gp_params = dict()
        gp_params['data.filename'] = data_node_train.identifier
        gp_params['class.filename'] = cls_node_train.identifier
        if 'wv_num_features' in user_options:
            gp_params['num.features'] = str(user_options['wv_num_features'])

    ##    if 'wv_minstd' in user_input:
    ##    	assert module_utils.is_number(
    ##            user_input['wv_minstd']), 'the sv_minstd should be number'
    ##        gp_parameters['min.std'] = str(user_input['wv_minstd'])
    ##
    ##    wv_feature_stat = ['wv_snr', 'wv_ttest', 'wv_snr_median',
    ##                       'wv_ttest_median', 'wv_snr_minstd',
    ##                       'wv_ttest_minstd', 'wv_snr_median_minstd',
    ##                       'wv_ttest_median_minstd']
    ##
    ##    assert parameters['wv_feature_stat'] in wv_feature_stat, (
    ##            'the wv_feature_stat is invalid')
    ##    gp_parameters['feature.selection.statistic'] = str(
    ##            wv_feature_stat.index(parameters[
    ##                'wv_feature_stat']))

        gp_path = config.genepattern
        gp_module = filelib.which(gp_path)
        assert gp_module, 'cannot find the %s' % gp_path

        download_directory = os.path.join(".", 'wv_result')
        command = [
            gp_module,
            module_name,
            '--id_and_version',
            module_id_version,
            '-o',
            download_directory,
        ]
        for key in gp_params:
            x = ['--parameters', "%s:%s" % (key, gp_params[key])]
            command.extend(x)
        x = " ".join(map(str, command))
        metadata["commands"] = [x]

        # DEBUG: If this is already run, don't run it again.
        if not os.path.exists(download_directory):
            process = subprocess.Popen(command,
                                       shell=False,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            process.wait()
            error_message = process.communicate()[1]

            # Ignore warning:
            # /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/
            #   python2.7/site-packages/rpy2/rinterface/__init__.py:185:
            #   RRuntimeWarning: Loading required package: rJava
            #
            # warnings.warn(x, RRuntimeWarning)
            x = error_message.strip()
            if x.endswith("warnings.warn(x, RRuntimeWarning)"):
                pass
            else:
                assert not x, error_message
        filelib.assert_exists(download_directory)

        # Find the prediction file.
        x = os.listdir(download_directory)
        assert 'stderr.txt' not in x, 'gene_pattern get error'
        x = [x for x in x if x.endswith("pred.odf")]
        assert x, "Missing predictions file"
        assert len(x) == 1, "Too many prediction files: %s" % repr(x)
        gp_file = x[0]

        gp_file = os.path.join(download_directory, gp_file)
        text = open(gp_file).readlines()
        #os.rename(os.path.join(download_directory, gp_file),
        #          os.path.join(download_directory, 'prediction.odf'))
        assert text[1][0:12] == 'HeaderLines='
        start = int(text[1][12:-1])
        newresult = [[
            'Sample_name', 'Predicted_class', 'Confidence', 'Actual_class',
            'Correct?'
        ]]
        for i in text[start + 2:]:
            line = i.split()
            n = len(line)
            newline = [
                ' '.join(line[0:n - 4]), line[n - 3], line[n - 2], line[n - 4],
                line[n - 1]
            ]
            newresult.append(newline)

        f = file(outfile, 'w')
        for i in newresult:
            f.write('\t'.join(i))
            f.write('\n')
        f.close()

        return metadata
Example #9
def main():
    import os
    import argparse
    import itertools

    from genomicode import filelib
    from genomicode import config
    from genomicode import parallel
    from genomicode import alignlib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("reference_genome", help="fasta file")

    parser.add_argument("-j",
                        dest="num_procs",
                        type=int,
                        default=1,
                        help="Number of jobs to run in parallel.")
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Just display the commands, and don't generate the alignment.")
    parser.add_argument("--window",
                        default=80,
                        type=int,
                        help="Number of bases in alignment.  Default: 80")

    group = parser.add_argument_group(title="Input")
    group.add_argument("--bam_file", help="Indexed BAM file.")
    group.add_argument("--bam_path", help="Path to BAM files.")
    group.add_argument(
        "--position",
        action="append",
        default=[],
        help="Specify a position to view, "
        "e.g. chr20:45,927,663 or chr20:45927663.  1-based coordinates")
    group.add_argument("--position_file",
                       help="Tab-delimited text file with two columns.  "
                       "Column 1 is chromosome, column 2 is position.")

    group = parser.add_argument_group(title="Output")
    group.add_argument("--prefix", help="Pre-pend a prefix to each outfile.")
    group.add_argument(
        "--outpath",
        help="If multiple alignments are generated, this option "
        "directs where to save the output files.")
    group.add_argument(
        "--noclobber",
        action="store_true",
        help="If an output file already exists, don't overwrite it.")

    # Parse the input arguments.
    args = parser.parse_args()
    filelib.assert_exists_nz(args.reference_genome)
    assert args.bam_file or args.bam_path, \
           "Either --bam_file or --bam_path must be provided."
    assert not (args.bam_file and args.bam_path), \
           "Cannot specify both --bam_file and --bam_path."
    if args.bam_file:
        filelib.assert_exists_nz(args.bam_file)
    if args.bam_path:
        assert os.path.exists(args.bam_path)
    if args.position_file:
        filelib.assert_exists_nz(args.position_file)
    if args.outpath and not os.path.exists(args.outpath):
        os.mkdir(args.outpath)
    if args.num_procs < 1 or args.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    assert args.window >= 1 and args.window < 500

    bam_filenames = []
    if args.bam_file:
        bam_filenames.append(args.bam_file)
    else:
        x = os.listdir(args.bam_path)
        x = [x for x in x if x.endswith(".bam")]
        x = [os.path.join(args.bam_path, x) for x in x]
        bam_filenames = x
    assert bam_filenames, "No bam files found."

    positions = []  # list of (chrom, pos)
    for x in args.position:
        chrom, pos = _parse_position(x)
        positions.append((chrom, pos))
    if args.position_file and os.path.exists(args.position_file):
        for cols in filelib.read_cols(args.position_file):
            assert len(cols) == 2, "Position file should have 2 columns"
            chrom, pos = cols
            pos = int(pos)
            assert pos >= 1
            positions.append((chrom, pos))
    assert positions, "No positions specified."

    # Make the commands.
    assert hasattr(config, "samtools")
    filelib.assert_exists(config.samtools)

    # Make sure we have the right version of samtools.
    # 1.2 (using htslib 1.2.1)
    # 0.1.18 (r982:295)
    version = alignlib.get_samtools_version()
    x = version.split(".")
    assert len(x) >= 2
    major = x[0]
    assert major in ["0", "1"], "Unknown samtools version: %s" % version
    major = int(major)
    assert major >= 1, "Requires samtools >= 1 (Current version: %s)" % version

    commands = []
    for x in itertools.product(bam_filenames, positions):
        bam_filename, (chrom, pos) = x

        p, f = os.path.split(bam_filename)
        sample, e = os.path.splitext(f)

        left = max(pos - args.window / 2, 1)
        pos_str = "%s:%s" % (chrom, left)

        x = "%2s.%9s.%s.html" % (chrom, pos, sample)
        if args.prefix:
            x = "%s.%s" % (args.prefix, x)
        if args.outpath:
            x = os.path.join(args.outpath, x)
        out_filename = x

        if args.noclobber and os.path.exists(out_filename):
            continue

        # samtools tview -d t -p 7:100550778 bam01/196B-lung.bam $FA
        sq = parallel.quote
        x = [
            sq(config.samtools),
            "tview",
            "-d",
            "h",
            "-p",
            pos_str,
            sq(bam_filename),
            sq(args.reference_genome),
        ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(out_filename))
        commands.append(x)

    if args.dry_run:
        for x in commands:
            print x
        return

    parallel.pshell(commands, max_procs=args.num_procs)
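
_parse_position is not shown; given that positions may be written as chr20:45,927,663 or chr20:45927663, a minimal sketch of what it presumably does:

def _parse_position(position_str):
    # Parse "chr20:45,927,663" or "chr20:45927663" into ("chr20", 45927663).
    # Sketch only; the real _parse_position may differ.
    x = position_str.replace(",", "")
    chrom, pos = x.split(":", 1)
    pos = int(pos)
    assert pos >= 1, "Positions are 1-based"
    return chrom, pos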
Example #10
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        import shutil
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import parselib
        from Betsy import module_utils as mlib

        mpileup_node, nc_node = antecedents
        mpileup_filenames = filelib.list_files_in_path(mpileup_node.identifier,
                                                       endswith=".pileup")
        assert mpileup_filenames, "No .pileup files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        #ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # Figure out whether we are calling SNPs or indels.  VarScan
        # writes separate .snp.vcf and .indel.vcf files; the right one
        # is picked up below.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["snp", "indel"]

        sample2pufile = {}  # sample -> mpileup filename
        for filename in mpileup_filenames:
            path, sample, ext = mlib.splitpath(filename)
            sample2pufile[sample] = filename

        # Make sure files exist for all the samples.
        all_samples = []
        for (normal_sample, cancer_sample) in nc_match:
            if normal_sample not in all_samples:
                all_samples.append(normal_sample)
            if cancer_sample not in all_samples:
                all_samples.append(cancer_sample)
        missing = [x for x in all_samples if x not in sample2pufile]
        x = parselib.pretty_list(missing, max_items=5)
        assert not missing, "Missing pileup files for samples: %s" % x

        # list of (sample, normal_sample, cancer_sample,
        #          normal_pileup, cancer_pileup,
        #          tmp1_normal, tmp1_cancer, log_filename, out_filename)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_pileup = sample2pufile[normal_sample]
            cancer_pileup = sample2pufile[cancer_sample]
            p, sample, ext = mlib.splitpath(cancer_pileup)
            tmp1_normal = opj(out_path, "%s.normal.tmp1" % sample)
            tmp1_cancer = opj(out_path, "%s.cancer.tmp1" % sample)
            log_filename = opj(out_path, "%s.log" % sample)
            out_filename = opj(out_path, "%s.vcf" % sample)
            x = sample, normal_sample, cancer_sample, \
                normal_pileup, cancer_pileup, \
                tmp1_normal, tmp1_cancer, log_filename, out_filename
            jobs.append(x)

        # VarScan will generate a "Parsing Exception" if there are 0
        # reads in a location.  Will be either "0" or blank.  Filter
        # those lines out.
        sq = parallel.quote
        commands = []
        for x in jobs:
            sample, normal_sample, cancer_sample, \
                    normal_pileup, cancer_pileup, \
                    tmp1_normal, tmp1_cancer, log_filename, out_filename = x
            x1 = "awk -F'\t' '$4 >= 1 {print}' %s > %s" % (normal_pileup,
                                                           tmp1_normal)
            x2 = "awk -F'\t' '$4 >= 1 {print}' %s > %s" % (cancer_pileup,
                                                           tmp1_cancer)
            commands.extend([x1, x2])
        parallel.pshell(commands, max_procs=num_cores)
        # Check the filtered .tmp1 files written by the awk commands.
        x = [x[5] for x in jobs] + [x[6] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # java -jar VarScan.jar somatic [normal_pileup] [tumor_pileup]
        #   [output] OPTIONS
        varscan = mlib.findbin("varscan_jar")

        # Use parameters from:
        # Using VarScan 2 for Germline Variant Calling and Somatic
        # Mutation Detection

        # Make a list of commands.
        commands = []
        for x in jobs:
            sample, normal_sample, cancer_sample, \
                    normal_pileup, cancer_pileup, \
                    tmp1_normal, tmp1_cancer, log_filename, out_filename = x
            x = [
                "java",
                "-jar",
                sq(varscan),
                "somatic",
                sq(tmp1_normal),
                sq(tmp1_cancer),
                sample,
                "--min-coverage",
                10,
                "--min-avg-qual",
                15,
                "--min-normal-coverage",
                10,
                "--min-tumor-coverage",
                10,
                "--min-var-freq",
                0.05,
                "--somatic-p-value",
                0.05,
                "--output-vcf",
                1,
            ]
            x = " ".join(map(str, x))
            x = "%s >& %s" % (x, log_filename)
            commands.append(x)

        parallel.pshell(commands, max_procs=num_cores)
        # Check that each VarScan job wrote a (non-empty) log file.
        x = [x[7] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Files in out_path can get very big.  Clean them up.
        # <sample>.normal.tmp1    Very big (10's Gb).
        # <sample>.cancer.tmp1    Very big (10's to 100 Gb).
        for x in jobs:
            sample, normal_sample, cancer_sample, \
                    normal_pileup, cancer_pileup, \
                    tmp1_normal, tmp1_cancer, log_filename, out_filename = x
            if os.path.exists(tmp1_normal):
                os.unlink(tmp1_normal)
            if os.path.exists(tmp1_cancer):
                os.unlink(tmp1_cancer)

        # Copy the final file to the right place.
        for x in jobs:
            sample, normal_sample, cancer_sample, \
                    normal_pileup, cancer_pileup, \
                    tmp1_normal, tmp1_cancer, log_filename, out_filename = x
            # Will be written in current directory.
            varscan_out = "%s.snp.vcf" % sample
            if vartype == "indel":
                varscan_out = "%s.indel.vcf" % sample
            filelib.assert_exists(varscan_out)
            shutil.copy2(varscan_out, out_filename)

        # VarScan names the samples "NORMAL" and "TUMOR".  Replace
        # them with the actual names.
        for x in jobs:
            sample, normal_sample, cancer_sample, \
                    normal_pileup, cancer_pileup, \
                    tmp1_normal, tmp1_cancer, log_filename, out_filename = x
            _fix_normal_cancer_names(out_filename, normal_sample,
                                     cancer_sample)
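
_fix_normal_cancer_names is not shown; assuming it only needs to rewrite the VCF #CHROM header line where VarScan puts the NORMAL and TUMOR sample columns, a minimal sketch might be:

def _fix_normal_cancer_names(vcf_filename, normal_sample, cancer_sample):
    # Replace VarScan's generic "NORMAL"/"TUMOR" sample names in the
    # #CHROM header line with the actual sample names.  Sketch only.
    lines = open(vcf_filename).readlines()
    for i, line in enumerate(lines):
        if not line.startswith("#CHROM"):
            continue
        cols = line.rstrip("\r\n").split("\t")
        cols = [normal_sample if c == "NORMAL" else c for c in cols]
        cols = [cancer_sample if c == "TUMOR" else c for c in cols]
        lines[i] = "\t".join(cols) + "\n"
    open(vcf_filename, "w").writelines(lines)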
Example #11
def run_module(network,
               input_ids,
               module_id,
               out_data_node,
               all_user_options,
               pool,
               transitions,
               user,
               job_name='',
               clean_up=True,
               num_cores=8,
               verbosity=0):
    # Return tuple of (output_path, IdentifiedDataNode, node_id,
    # elapsed time) for the node that was created.  Returns None if
    # this module fails (no compatible output nodes, or all output
    # nodes already generated).

    import os
    import sys
    import time
    import logging

    from genomicode import filelib
    from Betsy import config
    from Betsy import bie3

    assert user
    output_path = config.CACHE_PATH
    filelib.assert_exists(output_path)

    # Import module.
    module_node, module = _import_module(network, module_id)
    assert len(module_node.in_datatypes) == len(input_ids)
    module_name = module_node.name

    # Get the antecedents from the pool.
    antecedents = [pool[x] for x in input_ids]
    if len(antecedents) == 1:
        antecedents = antecedents[0]

    # Get the user_options for this module.  all_user_options contains
    # all options provided by the user.  Pull out the ones relevant
    # for this module.  Use the defaults when necessary.
    user_options = {}
    for option in module_node.option_defs:
        value = all_user_options.get(option.name)
        if value is None:
            value = option.default
        assert value is not None, "Missing input: %s" % option.name
        user_options[option.name] = value

    # Set up the directories and outfile.
    # Unfortunately, can't use timestamp in pathname, or else this
    # will never re-use prior analyses.  Have to be more clever about
    # this.
    h = _hash_module(module_name, antecedents, out_data_node.attributes,
                     user_options)
    ## Get milliseconds.
    #x = time.time()
    #ms = int((x-int(x))*100)
    #ts = time.strftime("%y%m%d.%H%M%S", time.localtime())
    #x = "%s.%02d__%s__B%03d__%s" % (ts, ms, module_name, VERSION, h)
    x = "%s__B%03d__%s" % (module_name, VERSION, h)
    result_dir = os.path.join(output_path, x)
    outfile = module.name_outfile(antecedents, user_options)

    # Create the IdentifiedDataNode that will be the output once the
    # module has been run.
    full_outfile = os.path.join(result_dir, outfile)
    #x = bie3.DataNode(network.nodes[next_id].datatype, **out_data.attributes)
    out_identified_data_node = bie3.IdentifiedDataNode(out_data_node,
                                                       full_outfile)

    #time_str = "[%s]" % time.strftime('%I:%M %p')
    time_str = "[%s]" % time.strftime('%a %I:%M %p')

    # Check if this has already been run.
    if _is_module_output_complete(result_dir):
        # Update timestamp on LAST_ACCESSED_FILE.
        open(os.path.join(result_dir, LAST_ACCESSED_FILE), 'w')
        # Read parameter file.
        filename = os.path.join(result_dir, BETSY_PARAMETER_FILE)
        assert os.path.exists(filename)
        params = _read_parameter_file(filename)
        elapsed = params["elapsed"]
        run_time = params["elapsed_pretty"]
        if run_time == "instant":
            x = "ran instantly"
        else:
            x = "took %s" % run_time
        x = "%s  %s (CACHED, previously %s)" % (time_str, module_name, x)
        #parselib.print_split(x, prefixn=2)
        print x
        if verbosity >= 1:
            # Print out the output directory.
            indent = len("[Thu 10:06 PM]  ")
            x = os.path.split(result_dir)[1]
            print "%s%s" % (" " * indent, x)
        sys.stdout.flush()
        return result_dir, out_identified_data_node, elapsed

    #_debug_is_module_output_complete(
    #    module_name, antecedents, out_data_node.attributes, user_options,
    #    VERSION, h, result_dir)

    # Running this module now.
    x = "%s  %s" % (time_str, module_name)
    #parselib.print_split(x, prefixn=2)
    print x
    if verbosity >= 1:
        # Print out the output directory.
        indent = len("[Thu 10:06 PM]  ")
        x = os.path.split(result_dir)[1]
        print "%s%s" % (" " * indent, x)
    sys.stdout.flush()

    # Run the analysis.  If someone else is currently running the same
    # analysis, then wait for them to finish.  However, if they have
    # somehow failed, then delete the incomplete results and start
    # over.
    #
    # 1.  Create directory.
    # 2.  Write in_progress.txt.
    # 3.  Run the analysis.  Refresh in_progress.txt every 5 sec.
    # 4.  Write finished.txt.
    # 5.  Stop refreshing in_progress.txt.
    #
    # IN_PROGRESS  FINISHED    INTERPRETATION
    #    missing    missing    Starting analysis?  Wait 5 sec, check again.
    #                          If everything still missing, then overwrite.
    #    missing    present    Complete analysis.
    # <5 sec old    missing    Still copying.  Wait.
    # <5 sec old    present    Finishing up.  Consider complete.
    # >5 sec old    missing    Abandoned.  Overwrite.
    # >5 sec old    present    Should not happen.  rm copying.txt, check
    #                          after 5 sec.  If missing, consider
    #                          complete.  If back, consider error.
    REFRESH = 5  # number of seconds to refresh copying.txt file.
    success_file = os.path.join(result_dir, FINISHED_FILE)
    last_accessed_file = os.path.join(result_dir, LAST_ACCESSED_FILE)
    copying_file = os.path.join(result_dir, IN_PROGRESS_FILE)
    exists = os.path.exists

    i_run_analysis = None
    while i_run_analysis is None:
        # Try to make the result_dir.  If I make it, then I should run the
        # analysis.  Otherwise, someone else has priority.  Let them run
        # the analysis.

        if not os.path.exists(result_dir):
            try:
                os.mkdir(result_dir)
                i_run_analysis = True
                break
            except OSError, x:
                pass

        # For debugging.  If I want to re-run the module over
        # the old one, then just keep this directory.
        if not CLEAN_UP_PATH_FOR_NEW_MODULE:
            i_run_analysis = True
            break

        last_refresh = None
        if exists(copying_file):
            last_refresh = time.time() - os.path.getctime(copying_file)

        if not exists(copying_file) and not exists(success_file):
            # BUG: This doesn't work.  What if this was abandoned, but
            # somebody else just happens to create the directory again
            # while I'm checking?  Will have result_dir, but nothing
            # inside it.
            # SOLUTION: Give them a cycle to create something.
            # DEBUG: Skip this.
            #i_run_analysis = True; break
            time.sleep(REFRESH + 1)
            if not exists(copying_file) and not exists(success_file):
                # Abandoned.  Delete the result dir and try again.
                if CLEAN_UP_PATH_FOR_NEW_MODULE:
                    _rmtree_multi(result_dir)
        elif not exists(copying_file) and exists(success_file):
            # Previous run is now finished.
            i_run_analysis = False
        # From here on down, copying_file should exist.
        elif last_refresh < REFRESH and not exists(success_file):
            # Still copying.  Wait.
            time.sleep(REFRESH)
        elif last_refresh < REFRESH and exists(success_file):
            # Finishing up.  Consider complete.
            i_run_analysis = False
        elif last_refresh >= REFRESH * 2 and not exists(success_file):
            # Steal the file.  This can cause a lot of problems, so
            # don't do this unless we're sure the other process is
            # really dead.
            # Abandoned.  Delete the result dir and try again.
            if CLEAN_UP_PATH_FOR_NEW_MODULE:
                _rmtree_multi(result_dir)
        elif last_refresh >= REFRESH * 2 and exists(success_file):
            os.unlink(copying_file)
            time.sleep(REFRESH)
            # Should not be coming back if analysis has already
            # completed successfully.
            assert not exists(copying_file), "Zombie in progress file"
            # At this point, no copying_file, but there is a
            # success_file.  Consider this analysis complete.
            i_run_analysis = False
        else:
            # Does not fit one of these.
            time.sleep(REFRESH)