Example #1
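    # Summarize a folder of FASTQC results: read each summary.txt, build a
    # per-sample table of statistics, write it as tab-delimited text, and
    # convert it to an Excel file with the txt2xls tool.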
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        import os
        from genomicode import filelib
        from genomicode import sortlib
        from Betsy import module_utils as mlib

        # Should be a folder of fastqc results.
        fastqc_path = in_data.identifier

        # Find all the FASTQC results.
        x = filelib.list_files_in_path(fastqc_path, endswith="summary.txt")
        x = [os.path.split(x)[0] for x in x]
        paths = x
        assert paths, "No FASTQC files found."

        # Read the results.
        all_results = [read_fastqc_results(x) for x in paths]
        assert all_results

        # Make table where the rows are the samples and the columns
        # are the statistics.
        sample2results = {}
        for x in all_results:
            assert x.sample not in sample2results
            sample2results[x.sample] = x
        all_statistics = all_results[0].statistics_order
        all_samples = sortlib.sort_natural(sample2results)

        table = []
        header = [
            "Sample", "Total Sequences", "Filtered Sequences",
            "Sequence length", "GC"
        ] + all_statistics
        table.append(header)
        for sample in all_samples:
            results = sample2results[sample]
            x1 = [sample]
            x2 = [
                results.total_sequences, results.filtered_sequences,
                results.sequence_length, results.percent_gc
            ]
            x3 = [results.statistics[x] for x in all_statistics]
            x = x1 + x2 + x3
            assert len(x) == len(header)
            table.append(x)

        # Write out the table as text file.
        TXT_FILE = "fastqc_summary.txt"
        handle = open(TXT_FILE, 'w')
        for x in table:
            print >> handle, "\t".join(map(str, x))
        handle.close()

        x = mlib.get_config("txt2xls", which_assert_file=True, quote=True)
        os.system("%s -b %s > %s" % (x, TXT_FILE, outfile))
        filelib.assert_exists_nz(outfile)
Example #2
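    # Merge the top-level .vcf files from a folder into a single VCF file.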
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        from genomicode import filelib

        vcf_node = in_data
        # Some callers, like jointsnvmix, will create vcf files for
        # each chromosome.  To avoid picking these up, only accept
        # .vcf files from the top level.
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf",
                                                   toplevel_only=True)
        assert vcf_filenames, "No .vcf files: %s" % vcf_node.identifier
        metadata = {}

        tmp_path = "indexed.vcf"
        m = merge_vcf_files(vcf_filenames, out_filename, num_cores, tmp_path)
        metadata.update(m)
        filelib.assert_exists(out_filename)  # may be size 0

        return metadata
Example #3
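    # Filter each .vcf file by variant type (all, snp, or indel) in parallel,
    # writing one filtered file per input into out_path.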
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel

        vcf_node = in_data
        vcf_files = filelib.list_files_in_path(vcf_node.identifier,
                                               endswith=".vcf",
                                               case_insensitive=True)
        filelib.safe_mkdir(out_path)
        metadata = {}

        jobs = []  # in_vcf_filename, out_vcf_filename
        for vcf_file in vcf_files:
            path, file_ = os.path.split(vcf_file)
            out_vcf_file = os.path.join(out_path, file_)
            x = vcf_file, out_vcf_file
            jobs.append(x)

        # Figure out whether the user wants SNPs or INDELs.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["all", "snp", "indel"]

        # Generate the commands.
        commands = []
        for x in jobs:
            in_vcf_file, out_vcf_file = x

            args = vartype, in_vcf_file, out_vcf_file
            x = filter_by_vartype, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        x = [x[-1] for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
Example #4
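# Scan vcf_path for .vcf files and derive each sample name from the file stem.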
def find_vcf_files(vcf_path):
    # Return list of (<sample>, <filename>).

    import os
    from genomicode import filelib
    #from genomicode import vcflib

    filenames = filelib.list_files_in_path(vcf_path,
                                           endswith=".vcf",
                                           case_insensitive=True)

    # Format:
    # <path>/<sample>.vcf
    vcf_files = []
    for filename in filenames:
        p, f = os.path.split(filename)
        sample = os.path.splitext(f)[0]
        #caller = vcflib.identify_caller(filename)
        #assert caller is not None, "Unknown caller: %s" % filename
        x = sample, filename
        vcf_files.append(x)
    return vcf_files
Example #5
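    # Copy the .idat files from a (possibly zipped) input folder into out_path.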
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        import shutil
        from genomicode import filelib
        from Betsy import module_utils

        path = module_utils.unzip_if_zip(in_data.identifier)
        x = filelib.list_files_in_path(path)
        x = [x for x in x if x.lower().endswith(".idat")]
        assert x, "No idat files."
        in_filenames = x

        if not os.path.exists(out_path):
            os.mkdir(out_path)
        for in_filename in in_filenames:
            in_path, in_file = os.path.split(in_filename)
            file_, ext = os.path.splitext(in_file)
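            # Drop a trailing "_Grn" suffix (Illumina green-channel IDAT
            # naming) from the file name.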
            if file_.endswith("_Grn"):
                file_ = file_[:-4]
            out_file = "%s%s" % (file_, ext)
            out_filename = os.path.join(out_path, out_file)
            shutil.copyfile(in_filename, out_filename)
Example #6
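    # Annotate filtered VCF files with SnpEff, running with -cancer when
    # matched germline samples are available.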
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import vcflib
        from Betsy import module_utils as mlib

        vcf_node, nc_node = antecedents
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf")
        assert vcf_filenames, "No .vcf files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Filenames:
        # <caller>.vcf

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])
        genome = mlib.get_user_option(user_options,
                                      "snpeff_genome",
                                      not_empty=True)
        databases = list_snpeff_databases()
        assert genome in databases, "Unknown genome database: %s" % genome

        # For each caller, do the SnpEff calls.  Some callers include
        # the somatic information, others do not.  If germline samples
        # are present, then run with -cancer.  Otherwise, do not.

        # java -Xmx16g -jar $SNPEFF -v -cancer -cancerSamples vcf03.txt
        #   GRCh37.75 vcf02.txt 1> test03.txt 2> test03.log

        # Don't bother annotating positions that do not pass filter.
        # Filter them out first based on FILTER column.

        opj = os.path.join
        jobs = []
        for in_filename in vcf_filenames:
            path, stem, ext = mlib.splitpath(in_filename)
            samples_file = opj(out_path, "%s.cancerSamples.txt" % stem)
            filtered_filename = opj(out_path, "%s.filtered_input" % stem)
            out_filename = opj(out_path, "%s.vcf" % stem)
            log_filename = opj(out_path, "%s.log" % stem)
            # Keep stem so the log-file check below can report which
            # sample failed.
            x = filelib.GenericObject(in_filename=in_filename,
                                      stem=stem,
                                      samples_file=samples_file,
                                      filtered_filename=filtered_filename,
                                      out_filename=out_filename,
                                      log_filename=log_filename)
            jobs.append(x)

        # First, filter each of the VCF files.
        commands = []
        for j in jobs:
            # For debugging.  If this file exists, don't filter it again.
            if os.path.exists(j.filtered_filename):
                continue
            args = j.in_filename, j.filtered_filename, wgs_or_wes
            x = vcflib.filter_vcf_file, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)

        # Make the cancer_samples files.
        for j in jobs:
            # Will generate this if there are cancer samples.
            make_cancer_samples_file(j.filtered_filename, nc_match,
                                     j.samples_file)

        # Make a list of commands.
        commands = []
        for j in jobs:
            cancer = False
            if os.path.exists(j.samples_file):
                cancer = True
            x = make_snpeff_command(j.filtered_filename,
                                    genome,
                                    j.out_filename,
                                    j.log_filename,
                                    is_cancer=cancer,
                                    cancer_samples_file=j.samples_file)
            commands.append(x)

        nc = mlib.calc_max_procs_from_ram(16, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = commands
        metadata["num_cores"] = nc

        # Make sure the analysis completed successfully.
        x = [x.out_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Log files should be empty.
        for j in jobs:
            filelib.assert_exists(j.log_filename)
            assert not filelib.exists_nz(j.log_filename), \
                   "Error with %s.\n%s" % (j.stem, j.log_filename)
            filelib.safe_unlink(j.log_filename)

        return metadata
Example #7
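    # Subtract mouse reads from single- or paired-end FASTQ files, using a
    # per-sample .matches.txt summary and a RAM-limited number of processes.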
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        # Since this is I/O heavy, don't use so many cores.  Also, it
        # takes 4-5 Gb RAM per process.
        MAX_CORES = mlib.calc_max_procs_from_ram(5, upper_max=4)

        fastq_node, sample_node, summary_node = antecedents
        fastq_path = fastq_node.identifier
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_path)
        assert fastq_files, "I could not find any FASTQ files."
        summary_filenames = filelib.list_files_in_path(
            summary_node.identifier, endswith=".matches.txt")
        assert summary_filenames, "No .matches.txt files."
        filelib.safe_mkdir(out_path)
        metadata = {}

        num_mismatches = mlib.get_user_option(
            user_options, "num_mismatches", type=int)
        assert num_mismatches >= 0 and num_mismatches < 25
        metadata["num_mismatches"] = num_mismatches

        sample2summary = {}  # sample -> summary_filename
        for filename in summary_filenames:
            # <sample>.matches.txt
            p, f = os.path.split(filename)
            assert f.endswith(".matches.txt")
            sample = f.replace(".matches.txt", "")
            assert sample not in sample2summary
            sample2summary[sample] = filename

        # list of (sample, fastq_file1, fastq_file2, summary_filename,
        #          out_file1, out_file2, subtracted_file1, subtracted_file2)
        jobs = []
        for x in fastq_files:
            sample, pair1_fastq, pair2_fastq = x
            assert sample in sample2summary, \
                   "Missing summary for sample: %s" % sample
            p1, f1 = os.path.split(pair1_fastq)
            if pair2_fastq:
                p2, f2 = os.path.split(pair2_fastq)
                assert p1 == p2
            out1_fastq = os.path.join(out_path, f1)
            sub1_fastq = os.path.join(out_path, "%s.subtracted" % f1)
            out2_fastq = None
            sub2_fastq = None
            if pair2_fastq:
                out2_fastq = os.path.join(out_path, f2)
                sub2_fastq = os.path.join(out_path, "%s.subtracted" % f2)
            x = sample, pair1_fastq, pair2_fastq, sample2summary[sample], \
                out1_fastq, out2_fastq, sub1_fastq, sub2_fastq
            jobs.append(x)

        jobs2 = []  # list of (function, args, keywds)
        for x in jobs:
            sample, pair1_fastq, pair2_fastq, summary_file, \
                    out1_fastq, out2_fastq, sub1_fastq, sub2_fastq = x
            x = summary_file, pair1_fastq, out1_fastq, sub1_fastq, \
                num_mismatches
            x = subtract_mouse_reads, x, {}
            jobs2.append(x)
            if pair2_fastq:
                x = summary_file, pair2_fastq, out2_fastq, sub2_fastq, \
                    num_mismatches
                x = subtract_mouse_reads, x, {}
                jobs2.append(x)

        nc = min(MAX_CORES, num_cores)
        results = parallel.pyfun(jobs2, num_procs=nc, DELAY=0.5)
        assert len(results) == len(jobs2)
        metadata["num_cores"] = nc
        
        # Make sure the fastq files were generated.
        x1 = [x[4] for x in jobs]
        x2 = [x[5] for x in jobs]
        x = x1 + x2
        x = [x for x in x if x]
        # BUG: If all reads were removed, then this will fail incorrectly.
        filelib.assert_exists_nz_many(x)

        return metadata
Example #8
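    # Parse htseq-count output files and write a summary matrix of
    # read-assignment statistics per sample.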
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        import os
        from genomicode import filelib
        from genomicode import alignlib

        count_path = in_data.identifier
        assert os.path.exists(count_path)
        assert os.path.isdir(count_path)
        result_files = filelib.list_files_in_path(count_path,
                                                  endswith=".count")
        assert result_files, "No .count files found."

        # Parse the count files.
        name2results = {}
        for filename in result_files:
            x = os.path.split(filename)[1]
            x = os.path.splitext(x)[0]
            name = x
            assert name not in name2results
            x = alignlib.parse_htseq_count_output(filename)
            name2results[name] = x
        assert name2results, "No samples"

        # Assemble into a summary matrix.
        # Rows:
        # no_feature
        # ambiguous
        # too_low_aQual
        # not_aligned
        # alignment_not_unique
        # total_mapped
        # total_fragments
        # percent_mapped

        ROWS = [
            "no_feature",
            "ambiguous",
            "too_low_aQual",
            "not_aligned",
            "alignment_not_unique",
        ]
        all_names = sorted(name2results)

        matrix = []
        header = ["Feature"] + all_names
        matrix.append(header)
        for rn in ROWS:
            x = [rn] + [getattr(name2results[n], rn) for n in all_names]
            assert len(x) == len(header)
            matrix.append(x)

        # Count the total mapped and total_fragments.
        total_mapped = []
        total_fragments = []
        perc_mapped = []
        perc_no_feature = []
        perc_ambiguous = []
        for n in all_names:
            # Sum up the counts
            results = name2results[n]
            tm, tf, pm = "", "", ""
            pnf, pamb = "", ""
            if not results.errors:
                x1 = sum(results.counts.values())
                x2 = 0
                for rn in ROWS:
                    x2 += getattr(results, rn)
                tm = x1
                tf = x1 + x2
                pm = tm / float(tf)
                pnf = results.no_feature / float(tf)
                pamb = results.ambiguous / float(tf)
            total_mapped.append(tm)
            total_fragments.append(tf)
            perc_mapped.append(pm)
            perc_no_feature.append(pnf)
            perc_ambiguous.append(pamb)

        x1 = ["total_mapped"] + total_mapped
        x2 = ["total_fragments"] + total_fragments
        x3 = ["perc_mapped"] + perc_mapped
        x4 = ["perc_no_feature"] + perc_no_feature
        x5 = ["perc_ambiguous"] + perc_ambiguous
        assert len(x1) == len(header)
        assert len(x2) == len(header)
        assert len(x3) == len(header)
        assert len(x4) == len(header)
        assert len(x5) == len(header)
        matrix.append(map(str, x1))
        matrix.append(map(str, x2))
        matrix.append(map(str, x3))
        matrix.append(map(str, x4))
        matrix.append(map(str, x5))

        # Write the data file.
        handle = open(outfile, 'w')
        for x in matrix:
            print >> handle, "\t".join(map(str, x))
        handle.close()
Example #9
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
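        # Parse snpEff-annotated VCF files, merge and clean the annotations,
        # and add them to the SimpleVariantMatrix.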
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        svm_node, vcf_node = antecedents
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf",
                                                   not_empty=True)
        metadata = {}

        # 1.  vcf_filenames
        # 2.  parsed_snpeff_files   one for each VCF file
        # 3.  merged_snpeff_file    just one file
        # 4.  clean_snpeff_file     clean up the annotations to final form
        # 5.  outfile

        merged_snpeff_file = "snpeff.merged.txt"
        cleaned_snpeff_file = "snpeff.clean.txt"

        jobs = []
        for vcf_filename in vcf_filenames:
            path, caller, ext = mlib.splitpath(vcf_filename)
            parsed_snpeff_file = "%s.parsed.txt" % caller
            j = filelib.GenericObject(
                caller=caller,
                vcf_filename=vcf_filename,
                parsed_snpeff_file=parsed_snpeff_file,
            )
            jobs.append(j)

        # Parse each of the snpeff files.
        commands = []
        for j in jobs:
            args = j.vcf_filename, j.parsed_snpeff_file
            # Debugging.  If this file exists, do not generate it
            # again.
            if os.path.exists(j.parsed_snpeff_file):
                continue
            x = parse_snpeff_file, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        # Merge the parsed files.
        x = [j.parsed_snpeff_file for j in jobs]
        x = [x for x in x if os.path.exists(x)]
        parsed_files = x
        # For debugging, don't regenerate if I don't need to.
        if not filelib.exists_nz(merged_snpeff_file):
            merge_parsed_files(parsed_files, merged_snpeff_file)

        # Clean up the snpEff file.  Coordinates should be unique.
        # For debugging, don't regenerate if I don't need to.
        if not filelib.exists_nz(cleaned_snpeff_file):
            clean_snpeff_file(merged_snpeff_file, cleaned_snpeff_file)

        # Merge the snpEff annotations into the SimpleVariantMatrix.
        add_snpeff_to_svm(svm_node.identifier, cleaned_snpeff_file, outfile)

        return metadata
Example #10
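    # Run bwa samse/sampe on .sai alignments and merged FASTQ files to
    # produce one SAM file per sample.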
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sai_node, orient_node, sample_node, reference_node = \
                    antecedents
        fastq_files = mlib.find_merged_fastq_files(
            sample_node.identifier, fastq_node.identifier)
        sai_path = sai_node.identifier
        assert filelib.dir_exists(sai_path)
        orient = mlib.read_orientation(orient_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "bwa %s" % alignlib.get_bwa_version()

        # Technically, this doesn't need the SampleGroupFile, since that's
        # already reflected in the sai data.  But it is better to use it,
        # because the sai data might not always be generated by BETSY.

        # Find the merged fastq files.

        # Find the sai files.
        sai_filenames = filelib.list_files_in_path(
            sai_path, endswith=".sai", case_insensitive=True)
        assert sai_filenames, "No .sai files."

        bwa = mlib.findbin("bwa")
        # bwa samse -f <output.sam> <reference.fa> <input.sai> <input.fq>
        # bwa sampe -f <output.sam> <reference.fa> <input_1.sai> <input_2.sai>
        #   <input_1.fq> <input_2.fq>

        # list of (pair1.fq, pair1.sai, pair2.fq, pair2.sai, output.sam)
        # all full paths
        jobs = []
        for x in fastq_files:
            sample, pair1_fq, pair2_fq = x

            # The sai file should be in the format:
            # <sai_path>/<sample>.sai    Single end read
            # <sai_path>/<sample>_1.sai  Paired end read
            # <sai_path>/<sample>_2.sai  Paired end read
            # Look for pair1_sai and pair2_sai.
            pair1_sai = pair2_sai = None
            for sai_filename in sai_filenames:
                p, s, e = mlib.splitpath(sai_filename)
                assert e == ".sai"
                if s == sample:
                    assert not pair1_sai
                    pair1_sai = sai_filename
                elif s == "%s_1" % (sample):
                    assert not pair1_sai
                    pair1_sai = sai_filename
                elif s == "%s_2" % (sample):
                    assert not pair2_sai
                    pair2_sai = sai_filename
            assert pair1_sai, "Missing .sai file: %s" % sample
            if pair2_fq:
                assert pair2_sai, "Missing .sai file 2: %s" % sample
            if pair2_sai:
                assert pair2_fq, "Missing .fq file 2: %s" % sample
                
            sam_filename = os.path.join(out_path, "%s.sam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)

            x = sample, pair1_fq, pair1_sai, pair2_fq, pair2_sai, \
                sam_filename, log_filename
            jobs.append(x)

        orientation = orient.orientation
        #orientation = sample_node.data.attributes["orientation"]
        assert orientation in ["single", "paired_fr", "paired_rf"]

        # Make a list of bwa commands.
        sq = mlib.sq
        commands = []
        for x in jobs:
            sample, pair1_fq, pair1_sai, pair2_fq, pair2_sai, \
                    sam_filename, log_filename = x
            if orientation == "single":
                assert not pair2_fq
                assert not pair2_sai

            samse = "samse"
            if orientation.startswith("paired"):
                samse = "sampe"

            x = [
                sq(bwa),
                samse,
                "-f", sq(sam_filename),
                sq(ref.fasta_file_full),
                ]
            if orientation == "single":
                x += [
                    sq(pair1_sai),
                    sq(pair1_fq),
                ]
            else:
                y = [
                    sq(pair1_sai),
                    sq(pair2_sai),
                    sq(pair1_fq),
                    sq(pair2_fq),
                    ]
                if orientation == "paired_rf":
                    y = [
                        sq(pair2_sai),
                        sq(pair1_sai),
                        sq(pair2_fq),
                        sq(pair1_fq),
                        ]
                x += y
            x += [
                ">&", sq(log_filename),
                ]
            x = " ".join(x)
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        x = [x[-2] for x in jobs]
        filelib.assert_exists_nz_many(x)
        
        return metadata