Example #1
def _update_and_check_max_read_depth(options, mkfile):
    if any(subdd["VCF_Filter"]["MaxReadDepth"] == "auto"
           for subdd in mkfile["Genotyping"].itervalues()):
        print_info("    - Determinining max-depth from depth-histograms ...")

    for (key, settings) in mkfile["Genotyping"].iteritems():
        required_keys = set()
        for sample in mkfile["Project"]["Samples"].itervalues():
            if sample["GenotypingMethod"].lower() == "samtools":
                required_keys.add(sample["Name"])

        max_depths = settings["VCF_Filter"]["MaxReadDepth"]
        if isinstance(max_depths, types.DictType):
            # Extra keys are allowed, to make it easier
            # to temporarily disable a sample
            missing_keys = required_keys - set(max_depths)
            if missing_keys:
                missing_keys = "\n    - ".join(sorted(missing_keys))
                message = "MaxReadDepth not specified for the following " \
                          "samples for %r:\n    - %s" % (key, missing_keys)
                raise MakefileError(message)

        elif isinstance(max_depths, types.StringTypes):
            assert max_depths.lower() == "auto", max_depths
            prefix = mkfile["Project"]["Regions"][key]["Prefix"]

            settings["VCF_Filter"]["MaxReadDepth"] \
                = _read_max_depths(options, prefix, required_keys)
        else:
            max_depths = dict.fromkeys(required_keys, max_depths)
            settings["VCF_Filter"]["MaxReadDepth"] = max_depths
Example #2
File: ui.py Project: muslih14/paleomix
    def flush(self):
        """See BaseUI.flush."""
        if BaseUI.flush(self) and self._running_nodes:
            self._print_header(self.states, self.threads)
            for node in sorted(map(str, self._running_nodes)):
                print_info("  - %s" % (node,), file=sys.stdout)
            print_info(file=sys.stdout)
Example #3
def run_admix_pipeline(config):
    print_info("\nBuilding %i Zonkey pipeline(s):" % (len(config.samples),))
    config.temp_root = os.path.join(config.destination, "temp")
    if not config.dry_run:
        fileutils.make_dirs(config.temp_root)

    cache = {}
    nodes = []
    items = config.samples.iteritems()
    for idx, (name, sample) in enumerate(sorted(items), start=1):
        root = sample["Root"]
        nuc_bam = sample["Files"].get("Nuc")
        mito_bam = sample["Files"].get("Mito")

        genomes = []
        if mito_bam:
            genomes.append("MT")
        if nuc_bam:
            genomes.append("Nuclear")

        print_info("  %i. %s: %s DNA" % (idx, name, ' and '.join(genomes)))

        nodes.extend(build_pipeline(config, root, nuc_bam, mito_bam, cache))

    if config.multisample and not config.admixture_only:
        nodes = [summary.SummaryNode(config, nodes)]

    if not run_pipeline(config, nodes, "\nRunning Zonkey:"):
        return 1
Example #4
File: ui.py Project: jelber2/paleomix
    def finalize(self):
        """Called by the pipeline at the termination of a run. By default,
        this function prints the location of the log-file if one was created
        during the run (e.g. if there were errors), and a summary of all nodes.
        """
        runtime = (self._end_time or 0) - (self.start_time or 0)

        if self.states[self.ERROR]:
            print_err("Done; but errors were detected ...")
        else:
            print_info("Done ...")

        print_info()
        rows = [("  Number of nodes:", sum(self.states)),
                ("  Number of done nodes:", self.states[self.DONE]),
                ("  Number of runable nodes:", self.states[self.RUNABLE]),
                ("  Number of queued nodes:", self.states[self.QUEUED]),
                ("  Number of outdated nodes:", self.states[self.OUTDATED]),
                ("  Number of failed nodes:", self.states[self.ERROR]),
                ("  Pipeline runtime:", _fmt_runtime(runtime))]

        for line in text.padded_table(rows):
            print_info(line)

        print_info("\nUse --list-output-files to view status of output files.")

        logfile = paleomix.logger.get_logfile()
        if logfile:
            print_debug("Log-file located at %r" % (logfile, ))

        print_info()
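
_fmt_runtime is not shown in these examples; a plausible stand-in that turns the runtime in seconds into an H:MM:SS display string might look as follows (an assumption for illustration, not the paleomix implementation):

def fmt_runtime(seconds):
    # Hypothetical helper: split a runtime in seconds into
    # hours, minutes, and seconds for the summary table.
    seconds = int(seconds)
    hours, remainder = divmod(seconds, 60 * 60)
    minutes, seconds = divmod(remainder, 60)
    return "%i:%02i:%02i" % (hours, minutes, seconds)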
Example #5
File: ui.py Project: jelber2/paleomix
    def flush(self):
        """See BaseUI.flush."""
        if BaseUI.flush(self) and self._running_nodes:
            self._print_header()
            for node in sorted(map(str, self._running_nodes)):
                print_info("  - %s" % (node, ), file=sys.stdout)
            print_info(file=sys.stdout)
Example #6
def setup_example(config):
    root = os.path.join(config.destination, 'zonkey_pipeline')

    with tarfile.TarFile(config.tablefile) as tar_handle:
        example_files = []
        existing_files = []
        for member in tar_handle.getmembers():
            if os.path.dirname(member.name) == 'examples' and member.isfile():
                example_files.append(member)

                destination = fileutils.reroot_path(root, member.name)
                if os.path.exists(destination):
                    existing_files.append(destination)

        if existing_files:
            print_err("Output files already exist at destination:\n    - %s"
                      % ("\n    - ".join(map(repr, existing_files))))
            return 1
        elif not example_files:
            print_err("Sample database %r does not contain example data; "
                      "cannot proceed." % (config.tablefile,))
            return 1

        if not os.path.exists(root):
            fileutils.make_dirs(root)

        for member in example_files:
            destination = fileutils.reroot_path(root, member.name)
            src_handle = tar_handle.extractfile(member)
            with open(destination, 'w') as out_handle:
                shutil.copyfileobj(src_handle, out_handle)

    print_info("Sucessfully saved example data in %r" % (root,))

    return 0
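
fileutils.reroot_path appears to replace the directory portion of a path while keeping the filename, which is how each 'examples/' member is mapped into the destination directory. A rough equivalent, assumed from the usage above:

import os

def reroot_path(root, path):
    # Assumed behaviour: drop the original directory and place the
    # file directly under root, e.g. ("out", "examples/a.txt") -> "out/a.txt".
    return os.path.join(root, os.path.basename(path))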
Example #7
File: ui.py Project: muslih14/paleomix
    def finalize(self):
        """Called by the pipeline at the termination of a run. By default,
        this function prints the location of the log-file if one was created
        during the run (e.g. if there were errors), and a summary of all nodes.
        """
        runtime = (self._end_time or 0) - (self._start_time or 0)

        if self.states[self.ERROR]:
            print_err("Done; but errors were detected ...")
        else:
            print_info("Done ...")

        print_info()
        rows = [("  Number of nodes:", sum(self.states)),
                ("  Number of done nodes:", self.states[self.DONE]),
                ("  Number of runable nodes:", self.states[self.RUNABLE]),
                ("  Number of queued nodes:", self.states[self.QUEUED]),
                ("  Number of outdated nodes:", self.states[self.OUTDATED]),
                ("  Number of failed nodes:", self.states[self.ERROR]),
                ("  Pipeline runtime:", _fmt_runtime(round(runtime)))]

        for line in text.padded_table(rows):
            print_info(line)

        print_info("\nUse --list-output-files to view status of output files.")

        logfile = paleomix.logger.get_logfile()
        if logfile:
            print_debug("Log-file located at %r" % (logfile,))

        print_info()
Example #8
def build_pipeline_full(config, makefile, return_nodes=True):
    result = []
    features = makefile["Options"]["Features"]
    for (target_name, sample_records) in makefile["Targets"].iteritems():
        print_info(".", end='')

        prefixes = []
        for (_, prefix) in makefile["Prefixes"].iteritems():
            samples = []
            for (sample_name, library_records) in sample_records.iteritems():
                libraries = []
                for (library_name,
                     barcode_records) in library_records.iteritems():
                    lanes = []
                    for (barcode, record) in barcode_records.iteritems():
                        lane = parts.Lane(config, prefix, record, barcode)

                        # ExcludeReads settings may exclude entire lanes
                        if lane.bams:
                            lanes.append(lane)

                    if lanes:
                        libraries.append(
                            parts.Library(config=config,
                                          target=target_name,
                                          prefix=prefix,
                                          lanes=lanes,
                                          name=library_name))

                if libraries:
                    samples.append(
                        parts.Sample(config=config,
                                     prefix=prefix,
                                     libraries=libraries,
                                     name=sample_name))

            if samples:
                prefixes.append(
                    parts.Prefix(config=config,
                                 prefix=prefix,
                                 samples=samples,
                                 features=features,
                                 target=target_name))

        if prefixes:
            target = parts.Target(config, prefixes, target_name)

            # Construct coverage, depth-histogram, and summary nodes, etc.
            parts.add_statistics_nodes(config, makefile, target)

            if return_nodes:
                # Extra tasks (e.g. coverage, depth-histograms, etc.)
                result.extend(target.nodes)
                # Output BAM files (raw, realigned)
                result.extend(target.bams.itervalues())
            else:
                result.append(target)

    return result
Example #9
def read_makefiles(options, filenames, commands):
    print_info("Reading makefile(s):")
    steps = frozenset(key for (key, _) in commands)

    makefiles = []
    for filename in filenames:
        makefile = paleomix.common.makefile.read_makefile(filename, _VALIDATION)
        makefile = _mangle_makefile(options, makefile["Makefile"], steps)
        makefiles.append(makefile)
    return makefiles
Example #10
def _collect_fasta_contigs(filename, cache={}):
    if filename in cache:
        return cache[filename]

    if not os.path.exists(filename + ".fai"):
        print_info("      - Index does not exist for %r; this may "
                   "take a while ..." % (filename,))

    cache[filename] = contigs = dict(FASTA.index_and_collect_contigs(filename))
    return contigs
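
Note the cache={} default argument: a mutable default is created once at function definition and shared across all calls, which this function exploits as a per-process memoization cache. A self-contained illustration of the idiom:

def slow_square(n, cache={}):
    # The same dict object persists between calls, so the second
    # request for a given n is answered from the cache.
    if n not in cache:
        cache[n] = n * n  # stand-in for the expensive FASTA indexing above
    return cache[n]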
Example #11
def _validate_prefixes(makefiles):
    """Validates prefixes and regions-of-interest, including an implementation
    of the checks included in GATK, which require that the FASTA for the human
    genome is ordered 1 .. 23. This is required since GATK will not run with
    human genomes in a different order.
    """
    already_validated = {}
    print_info("  - Validating prefixes ...")
    for makefile in makefiles:
        uses_gatk = makefile["Options"]["Features"]["RealignedBAM"]
        for prefix in makefile["Prefixes"].itervalues():
            path = prefix["Path"]
            if path in already_validated:
                prefix["IndexFormat"] = already_validated[path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                print_warn("    - Reference FASTA file does not exist:\n"
                           "      %r" % (path, ))
                continue
            elif not os.path.exists(path + ".fai"):
                print_info("    - Index does not exist for %r; this may "
                           "take a while ..." % (path, ))

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError, error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error, ))

            # Implementation of GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=uses_gatk)

            contigs = dict(contigs)
            regions_of_interest = prefix.get("RegionsOfInterest", {})
            for (name, fpath) in regions_of_interest.iteritems():
                try:
                    # read_bed_file returns iterator
                    for _ in bedtools.read_bed_file(fpath, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError), error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s" %
                                        (name, prefix["Name"], error))

            if max(contigs.itervalues()) > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn("    - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files." %
                           (path, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            already_validated[path] = prefix
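
_BAM_MAX_SEQUENCE_LENGTH is not shown here, but BAI indices cannot address positions beyond 2**29 - 1 bp (about 512 Mbp), which is why longer contigs force the switch to CSI indices; the constant is presumably defined along these lines:

# Assumed value: BAI uses 29-bit offsets, so contigs longer than
# 2**29 - 1 bp require the CSI index format instead.
_BAM_MAX_SEQUENCE_LENGTH = 2 ** 29 - 1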
Example #12
def _print_usage(pipeline):
    basename = "%s_pipeline" % (pipeline,)
    usage = \
        "BAM Pipeline v{version}\n" \
        "Usage:\n" \
        "  -- {cmd} help           -- Display this message.\n" \
        "  -- {cmd} example [...]  -- Create example project.\n" \
        "  -- {cmd} makefile [...] -- Print makefile template.\n" \
        "  -- {cmd} dryrun [...]   -- Perform dry run of pipeline.\n" \
        "  -- {cmd} run [...]      -- Run pipeline on provided makefiles.\n" \
        "  -- {cmd} remap [...]    -- Re-map hits from previous alignment."

    print_info(usage.format(version=paleomix.__version__,
                            cmd=basename,
                            pad=" " * len(basename)))
Example #13
File: mkfile.py Project: muslih14/paleomix
def main(argv, pipeline="bam"):
    assert pipeline in ("bam", "trim"), pipeline

    options, paths = parse_args(argv)
    records = {}
    for root in paths:
        if os.path.isdir(root):
            filename = os.path.join(root, _FILENAME)
        else:
            root, filename = os.path.split(root)[0], root

        if not os.path.exists(filename):
            print_err("ERROR: Could not find SampleSheet file: %r" % filename)
            return 1

        for record in read_alignment_records(filename):
            libraries = records.setdefault(record["SampleID"], {})
            barcodes = libraries.setdefault(record["Index"], [])

            record["Lane"] = int(record["Lane"])
            path = "%(SampleID)s_%(Index)s_L%(Lane)03i_R{Pair}_*.fastq.gz" \
                % record
            record["Path"] = select_path(os.path.join(root, path))
            barcodes.append(record)

    template = build_makefile(add_full_options=(pipeline == "bam"),
                              add_prefix_tmpl=(pipeline == "bam"))
    if options.minimal:
        template = strip_comments(template)

    print(template)

    for (sample, libraries) in records.iteritems():
        print("%s:" % sample)
        print("  %s:" % sample)
        for (library, barcodes) in libraries.iteritems():
            print("    %s:" % library)
            for record in barcodes:
                print("      {FCID}_{Lane}: {Path}".format(**record))
            print()
        print()

    if argv:
        print_info("Automatically generated makefile printed.\n"
                   "Please check for correctness before running pipeline.")
    return 0
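
The Path template above is filled in two stages: %-formatting substitutes the sample-sheet fields immediately, while the {Pair} placeholder is left for a later str.format() call (R1/R2). A worked example with hypothetical record values:

# Hypothetical sample-sheet record; only the %-fields are expanded here.
record = {"SampleID": "Sample1", "Index": "ACGTACGT", "Lane": 1}
path = "%(SampleID)s_%(Index)s_L%(Lane)03i_R{Pair}_*.fastq.gz" % record
assert path == "Sample1_ACGTACGT_L001_R{Pair}_*.fastq.gz"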
Example #14
def build_pipeline_trimming(config, makefile):
    """Builds only the nodes required to produce trimmed reads.
    This reduces the required complexity of the makefile to a minimum."""

    nodes = []
    for (_, samples) in makefile["Targets"].iteritems():
        print_info(".", end='')

        for libraries in samples.itervalues():
            for barcodes in libraries.itervalues():
                for record in barcodes.itervalues():
                    if record["Type"] in ("Raw", "Trimmed"):
                        offset = record["Options"]["QualityOffset"]
                        reads = Reads(config, record, offset)

                        nodes.extend(reads.nodes)

    return nodes
Example #15
def _update_regions(options, mkfile):
    print_info("    - Validating regions of interest ...")
    mkfile["Project"]["Regions"] = mkfile["Project"].pop("RegionsOfInterest")

    if not mkfile["Project"]["Regions"]:
        raise MakefileError('No regions of interest have been specified; '
                            'no analyses will be performed.')

    for (name, subdd) in mkfile["Project"]["Regions"].iteritems():
        if "Prefix" not in subdd:
            raise MakefileError("No genome specified for regions %r" % (name,))

        subdd["Name"]   = name
        subdd["Desc"]   = "{Prefix}.{Name}".format(**subdd)
        subdd["BED"]    = os.path.join(options.regions_root, subdd["Desc"] + ".bed")
        subdd["FASTA"]  = os.path.join(options.prefix_root, subdd["Prefix"] + ".fasta")

        required_files = (
            ("Regions file", subdd["BED"]),
            ("Reference sequence", subdd["FASTA"]),
        )

        for (desc, path) in required_files:
            if not os.path.isfile(path):
                raise MakefileError("%s does not exist for %r:\n  Path = %r"
                                    % (desc, name, path))

        # Collects seq. names / validate regions
        try:
            sequences = _collect_sequence_names(bed_file=subdd["BED"],
                                                fasta_file=subdd["FASTA"])
        except (IOError, BEDError), error:
            raise MakefileError("Error reading regions-of-interest %r:\n%s"
                                % (name, error))

        subdd["Sequences"] = {None: sequences}
        subdd["SubsetFiles"] = {None: ()}
        sampledd = subdd["Genotypes"] = {}
        for sample_name in mkfile["Project"]["Samples"]:
            fasta_file = ".".join((sample_name, subdd["Desc"], "fasta"))
            sampledd[sample_name] = os.path.join(options.destination,
                                                 mkfile["Project"]["Title"],
                                                 "genotypes",
                                                 fasta_file)
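
With hypothetical values, the derived naming works out as follows; the BED and FASTA paths are then rooted in options.regions_root and options.prefix_root respectively:

# Hypothetical region entry: regions named "Genes" mapped against prefix "equCab2".
subdd = {"Prefix": "equCab2", "Name": "Genes"}
desc = "{Prefix}.{Name}".format(**subdd)
assert desc == "equCab2.Genes"
# => <regions_root>/equCab2.Genes.bed and <prefix_root>/equCab2.fasta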
Example #16
File: ui.py Project: jelber2/paleomix
    def process_key_presses(self, nodegraph, max_threads, ui):
        if not self._tty_settings:
            return max_threads

        help_printed = False
        old_max_threads = max_threads
        while self.poll_stdin():
            character = sys.stdin.read(1)
            if character == "+":
                max_threads = min(multiprocessing.cpu_count(), max_threads + 1)
            elif character == "-":
                max_threads = max(1, max_threads - 1)
            elif character in "lL":
                print_info(file=sys.stdout)
                progress_printer = RunningUI()
                progress_printer.max_threads = max_threads
                progress_printer.start_time = ui.start_time
                progress_printer.refresh(nodegraph)
                progress_printer.flush()
            elif character in "hH":
                if help_printed:
                    continue

                help_printed = True
                print_info("""
Commands:
  Key   Function
  h     Prints this message.
  l     Lists the currently running nodes.
  +     Increases the maximum number of threads by one.
  -     Decreases the maximum number of threads by one; already running tasks
        are NOT terminated if the number of threads currently used exceeds the
        resulting maximum.
""",
                           file=sys.stdout)
            else:
                continue

        if max_threads != old_max_threads:
            print_debug("Maximum number of threads changed from %i to %i." %
                        (old_max_threads, max_threads),
                        file=sys.stdout)

        return max_threads
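
poll_stdin is not shown; a non-blocking check for pending key presses is typically built on select with a zero timeout, along these lines (an assumption, not the actual implementation):

import select
import sys

def poll_stdin():
    # Zero timeout: return immediately, reporting whether stdin
    # has unread input rather than waiting for any.
    readable, _, _ = select.select([sys.stdin], [], [], 0.0)
    return bool(readable)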
Example #17
def _check_bam_sequences(options, mkfile, steps):
    """Check that the BAM files contains the reference sequences found in the
    FASTA file, matched by name and length; extra sequences are permitted. This
    check is only done if genotyping is to be carried out, to reduce the
    overhead of reading the BAM file headers.

    """
    if ("genotype" not in steps) and ("genotyping" not in steps):
        return

    print_info("    - Validating BAM files ...")
    bam_files = {}
    for regions in mkfile["Project"]["Regions"].itervalues():
        for sample in mkfile["Project"]["Samples"].itervalues():
            filename = os.path.join(options.samples_root, "%s.%s.bam"
                                    % (sample["Name"], regions["Prefix"]))
            if regions["Realigned"]:
                filename = add_postfix(filename, ".realigned")

            if os.path.exists(filename):
                bam_files[filename] = _collect_fasta_contigs(regions["FASTA"])

    for (filename, contigs) in bam_files.iteritems():
        with pysam.Samfile(filename) as handle:
            bam_contigs = dict(zip(handle.references, handle.lengths))

            for (contig, length) in contigs.iteritems():
                bam_length = bam_contigs.get(contig)

                if bam_length is None:
                    message = ("Reference sequence missing from BAM file; "
                               "BAM file aligned against different prefix?\n"
                               "    BAM file = %s\n    Sequence name = %s") \
                               % (filename, contig)
                    raise MakefileError(message)
                elif bam_length != length:
                    message = ("Length of reference sequence in FASTA differs "
                               "from length of sequence in BAM file; BAM file "
                               "aligned against different prefix?\n"
                               "    BAM file = %s\n"
                               "    Length in FASTA = %s\n"
                               "    Length in BAM = %s") \
                               % (filename, length, bam_length)
                    raise MakefileError(message)
Example #18
File: config.py Project: jelber2/paleomix
    def _write_config_file(self, config, defaults):
        """Writes a basic config files, using the values previously found in the
        config files, and specified on the command-line."""
        defaults_cfg = ConfigParser.SafeConfigParser()
        defaults_cfg.add_section("Defaults")
        for key in defaults:
            value = getattr(config, key)
            if isinstance(value, (types.ListType, types.TupleType)):
                value = ";".join(value)

            defaults_cfg.set("Defaults", key, str(value))

        filename = self._filenames[-1]
        make_dirs(os.path.dirname(filename))
        with open(filename, "w") as handle:
            defaults_cfg.write(handle)

        print_info("Wrote config file %r" % (filename, ))
        sys.exit(0)
Example #19
File: config.py Project: muslih14/paleomix
def parse_config(argv):
    migrate_config()

    options, args = _run_config_parser(argv)
    paleomix.ui.set_ui_colors(options.ui_colors)

    if args and args[0] in ("example", "examples"):
        return options, args
    elif (len(args) < 2) and (args != ["mkfile"] and args != ["makefile"]):
        description = _DESCRIPTION.replace("%prog", "phylo_pipeline").strip()
        console.print_info("Phylogeny Pipeline v%s\n" % (paleomix.__version__,))
        console.print_info(description)
        return options, args

    commands = select_commands(args[0] if args else ())
    if any((func is None) for (_, func) in commands):
        unknown_commands = ", ".join(repr(key) for (key, func) in commands if func is None)
        raise ConfigError("Unknown analysis step(s): %s" % (unknown_commands,))

    return options, args
Example #20
def _update_regions(options, mkfile):
    print_info("    - Validating regions of interest ...")
    mkfile["Project"]["Regions"] = mkfile["Project"].pop("RegionsOfInterest")

    for (name, subdd) in mkfile["Project"]["Regions"].iteritems():
        if "Prefix" not in subdd:
            raise MakefileError("No genome specified for regions %r" % (name,))

        subdd["Name"]   = name
        subdd["Desc"]   = "{Prefix}.{Name}".format(**subdd)
        subdd["BED"]    = os.path.join(options.regions_root, subdd["Desc"] + ".bed")
        subdd["FASTA"]  = os.path.join(options.prefix_root, subdd["Prefix"] + ".fasta")

        required_files = (
            ("Regions file", subdd["BED"]),
            ("Reference sequence", subdd["FASTA"]),
        )

        for (desc, path) in required_files:
            if not os.path.isfile(path):
                raise MakefileError("%s does not exist for %r:\n  Path = %r"
                                    % (desc, name, path))

        # Collects seq. names / validate regions
        try:
            sequences = _collect_sequence_names(bed_file=subdd["BED"],
                                                fasta_file=subdd["FASTA"])
        except (IOError, BEDError), error:
            raise MakefileError("Error reading regions-of-interest %r:\n%s"
                                % (name, error))

        subdd["Sequences"] = {None: sequences}
        subdd["SubsetFiles"] = {None: ()}
        sampledd = subdd["Genotypes"] = {}
        for sample_name in mkfile["Project"]["Samples"]:
            fasta_file = ".".join((sample_name, subdd["Desc"], "fasta"))
            sampledd[sample_name] = os.path.join(options.destination,
                                                 mkfile["Project"]["Title"],
                                                 "genotypes",
                                                 fasta_file)
Example #21
File: mkfile.py Project: jelber2/paleomix
def main(argv, pipeline="bam"):
    assert pipeline in ("bam", "trim"), pipeline

    options, filenames = parse_args(argv)
    records = read_sample_sheets(filenames)
    if records is None:
        return 1

    template = build_makefile(add_full_options=(pipeline == "bam"),
                              add_prefix_tmpl=(pipeline == "bam"),
                              add_sample_tmpl=not records)
    if options.minimal:
        template = strip_comments(template)

    print(template)

    print_samples(records)

    if argv:
        print_info("Automatically generated makefile printed.\n"
                   "Please check for correctness before running pipeline.")
    return 0
Example #22
    if max_depth is None:
        raise MakefileError("MaxDepth for %r not found in depth-histogram: %r"
                            % (sample, filename))
    elif max_depth == "NA":
        raise MakefileError("MaxDepth is not calculated for sample %r; "
                            "cannot determine MaxDepth values automatically."
                            % (filename,))
    elif not max_depth.isdigit():
        raise MakefileError("MaxDepth is not a valid for sample %r in %r; "
                            "expected integer, found %r."
                            % (sample, filename, max_depth))

    max_depth = int(max_depth)

    print_info("        - %s.%s = %i" % (sample, prefix, max_depth))
    _DEPTHS_CACHE[filename] = max_depth
    return max_depth


_DEPTHS_CACHE = {}


def _check_indels_and_msa(mkfile):
    msa     = mkfile["MultipleSequenceAlignment"]
    regions = mkfile["Project"]["Regions"]
    for (name, subdd) in regions.iteritems():
        msa_enabled = msa[name]["Enabled"]

        if subdd["IncludeIndels"] and not msa_enabled:
            raise MakefileError("Regions %r includes indels, but MSA is disabled!" % (name,))
Example #23
        try:
            os.makedirs(config.temp_root)
        except OSError, error:
            print_err("ERROR: Could not create temp root:\n\t%s" % (error, ))
            return 1

    if not os.access(config.temp_root, os.R_OK | os.W_OK | os.X_OK):
        print_err("ERROR: Insufficient permissions for temp root: '%s'" %
                  (config.temp_root, ))
        return 1

    # Init worker-threads before reading in any more data
    pipeline = Pypeline(config)

    try:
        print_info("Reading makefiles ...")
        makefiles = read_makefiles(config, args, pipeline_variant)
    except (MakefileError, paleomix.yaml.YAMLError, IOError), error:
        print_err("Error reading makefiles:",
                  "\n  %s:\n   " % (error.__class__.__name__, ),
                  "\n    ".join(str(error).split("\n")))
        return 1

    logfile_template = time.strftime("bam_pipeline.%Y%m%d_%H%M%S_%%02i.log")
    paleomix.logger.initialize(config, logfile_template)
    logger = logging.getLogger(__name__)

    pipeline_func = build_pipeline_trimming
    if pipeline_variant == "bam":
        # Build .fai files for reference .fasta files
        index_references(config, makefiles)
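
Note that the "except OSError, error" form used throughout these examples is Python 2-only syntax; the equivalent spelling that also works under Python 3 is:

import os

try:
    os.makedirs("/tmp/example-temp-root")  # hypothetical path
except OSError as error:  # Python 3 equivalent of "except OSError, error"
    print("ERROR: Could not create temp root:\n\t%s" % (error,))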