Example #1
    def _run(self, _config, _temp):
        for filename in self.input_files:
            check_fasta_file(filename)
        output_file, = self.output_files
        make_dirs(os.path.dirname(output_file))
        with open(output_file, "w"):
            pass
Example #2
def setup_example(config):
    root = os.path.join(config.destination, 'zonkey_pipeline')

    with tarfile.TarFile(config.tablefile) as tar_handle:
        example_files = []
        existing_files = []
        for member in tar_handle.getmembers():
            if os.path.dirname(member.name) == 'examples' and member.isfile():
                example_files.append(member)

                destination = fileutils.reroot_path(root, member.name)
                if os.path.exists(destination):
                    existing_files.append(destination)

        if existing_files:
            print_err("Output files already exist at destination:\n    - %s"
                      % ("\n    - ".join(map(repr, existing_files))))
            return 1
        elif not example_files:
            print_err("Sample database %r does not contain example data; "
                      "cannot proceed." % (config.tablefile,))
            return 1

        if not os.path.exists(root):
            fileutils.make_dirs(root)

        for member in example_files:
            destination = fileutils.reroot_path(root, member.name)
            src_handle = tar_handle.extractfile(member)
            with open(destination, 'w') as out_handle:
                shutil.copyfileobj(src_handle, out_handle)

    print_info("Sucessfully saved example data in %r" % (root,))

    return 0
Example #3
    def run(self, _):
        handles = []
        try:
            last_pos = None
            observed_reads = collections.defaultdict(list)
            for (record, filename) in self._open_samfiles(handles, self.input_files):
                curr_pos = (record.pos, record.tid)
                if curr_pos != last_pos:
                    self._process_reads(observed_reads, self.output_files)
                    observed_reads.clear()
                    last_pos = curr_pos

                    # Stop once the trailing, unmapped reads are reached
                    if record.tid == -1:
                        break

                observed_reads[record.qname].append((record, filename))
            self._process_reads(observed_reads, self.output_files)

            # Everything is ok, touch the output files
            for fpath in self.output_files:
                make_dirs(os.path.dirname(fpath))
                with open(fpath, "w"):
                    pass
        finally:
            for handle in handles:
                handle.close()
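
A recurring idiom in this listing: validation nodes finish by "touching" their output files, creating any missing output directory and then opening each file for writing so that an empty marker is left behind. A minimal helper capturing the idiom might look as follows; this is a sketch distilled from the examples above (touch_outputs is not a name from the original codebase, and make_dirs is assumed to be the paleomix fileutils helper used throughout):

import os

from paleomix.common.fileutils import make_dirs

def touch_outputs(output_files):
    # Create each output directory (if any), then create/truncate the file
    for fpath in output_files:
        dirname = os.path.dirname(fpath)
        if dirname:
            make_dirs(dirname)
        with open(fpath, "w"):
            pass
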
Example #4
def run_admix_pipeline(config):
    print_info("\nBuilding %i Zonkey pipeline(s):" % (len(config.samples),))
    config.temp_root = os.path.join(config.destination, "temp")
    if not config.dry_run:
        fileutils.make_dirs(config.temp_root)

    cache = {}
    nodes = []
    items = config.samples.iteritems()
    for idx, (name, sample) in enumerate(sorted(items), start=1):
        root = sample["Root"]
        nuc_bam = sample["Files"].get("Nuc")
        mito_bam = sample["Files"].get("Mito")

        genomes = []
        if mito_bam:
            genomes.append("MT")
        if nuc_bam:
            genomes.append("Nuclear")

        print_info("  %i. %s: %s DNA" % (idx, name, ' and '.join(genomes)))

        nodes.extend(build_pipeline(config, root, nuc_bam, mito_bam, cache))

    if config.multisample and not config.admixture_only:
        nodes = [summary.SummaryNode(config, nodes)]

    if not run_pipeline(config, nodes, "\nRunning Zonkey:"):
        return 1
Example #5
    def _create_temp_dir(self, _config):
        """Called by 'run' in order to create a temporary folder.
        To allow restarting from checkpoints, we use a fixed folder
        determined by the output_template."""
        temp = os.path.join(self._dirname, self._template % ("temp",))
        fileutils.make_dirs(temp)
        return temp
Example #6
def run_admix_pipeline(config):
    log = logging.getLogger(__name__)
    log.info("Building %i Zonkey pipeline(s):", len(config.samples))
    config.temp_root = os.path.join(config.destination, "temp")
    if not config.dry_run:
        fileutils.make_dirs(config.temp_root)

    cache = {}
    nodes = []
    items = iter(config.samples.items())
    for idx, (name, sample) in enumerate(sorted(items), start=1):
        root = sample["Root"]
        nuc_bam = sample["Files"].get("Nuc")
        mito_bam = sample["Files"].get("Mito")

        genomes = []
        if mito_bam:
            genomes.append("MT")
        if nuc_bam:
            genomes.append("Nuclear")

        log.info("  %i. %s: %s DNA", idx, name, " and ".join(genomes))

        nodes.extend(build_pipeline(config, root, nuc_bam, mito_bam, cache))

    if config.multisample and not config.admixture_only:
        nodes = [summary.SummaryNode(config, nodes)]

    if not run_pipeline(config, nodes, "Running Zonkey"):
        return 1
Example #7
def process_bam(args, data, bam_handle):
    raw_references = bam_handle.references
    references = map(common.contig_name_to_plink_name, raw_references)

    if args.downsample:
        sys.stderr.write("Downsampling to at most %i BAM records ...\n"
                         % (args.downsample))
        bam_handle = DownsampledBAM(bam_handle, args.downsample, references)

    statistics = {"n_reads": 0,
                  "n_reads_used": 0,
                  "n_sites_incl_ts": 0,
                  "n_sites_excl_ts": 0}

    fileutils.make_dirs(args.root)

    with open(os.path.join(args.root, 'incl_ts.tped'), 'w') as output_incl:
        with open(os.path.join(args.root, 'excl_ts.tped'), 'w') as output_excl:
            with GenotypeReader(args.data) as reader:
                for ref, sites in reader:
                    raw_ref = raw_references[references.index(ref)]

                    sys.stderr.write("Reading %r from BAM ...\n" % (raw_ref,))
                    raw_sites = bam_handle.fetch(raw_ref)
                    for pos, line, nucleotides in sites.process(raw_sites,
                                                                statistics):
                        process_record(ref, pos, line, nucleotides,
                                       out_incl_ts=output_incl,
                                       out_excl_ts=output_excl,
                                       statistics=statistics)

                write_summary(args, os.path.join(args.root, "common.summary"),
                              statistics=statistics)
                write_tfam(os.path.join(args.root, "common.tfam"),
                           data, reader.samples, args.name)
Example #8
    def _run(self, _config, _temp):
        for filename in self.input_files:
            check_fasta_file(filename)
        (output_file,) = self.output_files
        if os.path.dirname(output_file):
            make_dirs(os.path.dirname(output_file))
        with open(output_file, "w"):
            pass
Example #9
    def _teardown(self, config, temp):
        fileutils.make_dirs(self._root)

        fileutils.move_file(os.path.join(temp, "report.html"),
                            os.path.join(self._root, "report.html"))

        css_path = paleomix.resources.report("zonkey", "report.css")
        fileutils.copy_file(css_path, os.path.join(self._root, "report.css"))
Example #10
    def _teardown(self, config, temp):
        fileutils.make_dirs(self._root)

        fileutils.move_file(os.path.join(temp, "summary.html"),
                            os.path.join(self._root, "summary.html"))

        css_path = paleomix.resources.report("zonkey", "report.css")
        fileutils.copy_file(css_path, os.path.join(self._root, "summary.css"))
Example #11
    def _run(self, _config, _temp):
        stats = check_fastq_files(self._files, self._offset, True)
        output_file = tuple(self.output_files)[0]
        if os.path.dirname(output_file):
            make_dirs(os.path.dirname(output_file))

        data = json.dumps(stats)
        with open(output_file, "w") as handle:
            handle.write(data)
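
The node above persists its statistics as JSON, so a downstream consumer could simply read them back; a hypothetical read-back, assuming the same output_file path:

import json

with open(output_file) as handle:
    stats = json.load(handle)
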
Example #12
def test_move_file__move_to_existing_folder(temp_folder):
    assert make_dirs(os.path.join(temp_folder, "src"))
    assert make_dirs(os.path.join(temp_folder, "dst"))
    file_1 = os.path.join(temp_folder, "src", "file_1")
    file_2 = os.path.join(temp_folder, "dst", "file_2")
    set_file_contents(file_1, "2")
    move_file(file_1, file_2)
    assert_equal(os.listdir(os.path.dirname(file_1)), [])
    assert_equal(os.listdir(os.path.dirname(file_2)), ["file_2"])
    assert_equal(get_file_contents(file_2), "2")
Example #13
    def run(self, _):
        check_bam_files(self.input_files, self._throw_node_error)

        # Everything is ok, touch the output files
        for fpath in self.output_files:
            if os.path.dirname(fpath):
                make_dirs(os.path.dirname(fpath))

            with open(fpath, "w"):
                pass
Example #14
def process_bam(args, data, bam_handle, mapping):
    reverse_mapping = dict(zip(mapping.values(), mapping))
    raw_references = bam_handle.references
    references = [reverse_mapping.get(name, name) for name in raw_references]

    if args.downsample:
        sys.stderr.write("Downsampling to at most %i BAM records\n" %
                         (args.downsample))
        bam_handle = DownsampledBAM(bam_handle, args.downsample, references)

    statistics = {
        "n_reads": 0,
        "n_reads_used": 0,
        "n_sites_incl_ts": 0,
        "n_sites_excl_ts": 0,
    }

    fileutils.make_dirs(args.root)

    with open(os.path.join(args.root, "incl_ts.tped"), "w") as output_incl:
        with open(os.path.join(args.root, "excl_ts.tped"), "w") as output_excl:
            with GenotypeReader(args.database) as reader:
                for ref, sites in reader:
                    records = set()
                    raw_ref = raw_references[references.index(ref)]

                    sys.stderr.write("Reading %r from BAM\n" % (raw_ref, ))
                    raw_sites = bam_handle.fetch(raw_ref)
                    for pos, line, nucleotides in sites.process(
                            raw_sites, statistics):
                        process_record(
                            ref,
                            pos,
                            line,
                            nucleotides,
                            out_incl_ts=output_incl,
                            out_excl_ts=output_excl,
                            statistics=statistics,
                            records=records,
                        )

                write_summary(
                    args,
                    os.path.join(args.root, "common.summary"),
                    statistics=statistics,
                )
                write_tfam(
                    os.path.join(args.root, "common.tfam"),
                    data,
                    reader.samples,
                    args.name,
                )
Example #15
def _open_logfile(folder, template, start=0):
    """Try to open a new logfile, taking steps to ensure that
    existing logfiles using the same template are not clobbered."""
    if not os.path.exists(folder):
        _fs.make_dirs(folder)

    flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL
    while True:
        filename = os.path.join(folder, template % (start, ))
        try:
            if not os.path.exists(filename):
                return filename, os.fdopen(os.open(filename, flags), "w")
        except OSError as error:
            if error.errno != errno.EEXIST:
                raise
        start += 1
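
The os.O_WRONLY | os.O_CREAT | os.O_EXCL combination above makes os.open fail with EEXIST rather than silently reuse a logfile created by a concurrent process, so the loop just advances the counter and retries. A hypothetical call, with made-up folder and template names:

filename, handle = _open_logfile("logs", "paleomix_%03i.log")
with handle:
    handle.write("Pipeline started\n")
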
Example #16
def main(argv):
    args = parse_args(argv)
    args.revision = datetime.datetime.today().strftime('%Y%m%d')

    data = _collect_samples(args.reference, args.samples)
    if not data:
        return 1

    fileutils.make_dirs(args.root)

    _write_contigs(args, os.path.join(args.root, 'contigs.txt'))
    _write_samples(args, data['samples'],
                   os.path.join(args.root, 'samples.txt'))
    _write_settings(args, data['contigs'],
                    os.path.join(args.root, 'settings.yaml'))
    _write_genotypes(args, data, os.path.join(args.root, 'genotypes.txt'))
    _write_build_sh(args, os.path.join(args.root, 'build.sh'))
Example #17
def main(argv):
    args = parse_args(argv)
    args.revision = datetime.datetime.today().strftime("%Y%m%d")

    data = _collect_samples(args.reference, args.samples)
    if not data:
        return 1

    fileutils.make_dirs(args.root)

    _write_contigs(args, os.path.join(args.root, "contigs.txt"))
    _write_samples(args, data["samples"], os.path.join(args.root,
                                                       "samples.txt"))
    _write_settings(args, data["contigs"],
                    os.path.join(args.root, "settings.yaml"))
    _write_genotypes(args, data, os.path.join(args.root, "genotypes.txt"))
    _write_build_sh(args, os.path.join(args.root, "build.sh"))
Example #18
def test_copy_file__copy_to_new_folder(temp_folder):
    assert make_dirs(os.path.join(temp_folder, "src"))
    file_1 = os.path.join(temp_folder, "src", "file_1")
    file_2 = os.path.join(temp_folder, "dst", "file_2")
    set_file_contents(file_1, "2")
    copy_file(file_1, file_2)
    assert_equal(os.listdir(os.path.dirname(file_1)), ["file_1"])
    assert_equal(os.listdir(os.path.dirname(file_2)), ["file_2"])
    assert_equal(get_file_contents(file_1), "2")
    assert_equal(get_file_contents(file_2), "2")
Example #19
def process_bam(args, data, bam_handle):
    raw_references = bam_handle.references
    references = map(common.contig_name_to_plink_name, raw_references)

    if args.downsample:
        sys.stderr.write("Downsampling to at most %i BAM records ...\n" %
                         (args.downsample))
        bam_handle = DownsampledBAM(bam_handle, args.downsample, references)

    statistics = {
        "n_reads": 0,
        "n_reads_used": 0,
        "n_sites_incl_ts": 0,
        "n_sites_excl_ts": 0
    }

    fileutils.make_dirs(args.root)

    with open(os.path.join(args.root, 'incl_ts.tped'), 'w') as output_incl:
        with open(os.path.join(args.root, 'excl_ts.tped'), 'w') as output_excl:
            with GenotypeReader(args.database) as reader:
                for ref, sites in reader:
                    records = set()
                    raw_ref = raw_references[references.index(ref)]

                    sys.stderr.write("Reading %r from BAM ...\n" % (raw_ref, ))
                    raw_sites = bam_handle.fetch(raw_ref)
                    for pos, line, nucleotides in sites.process(
                            raw_sites, statistics):
                        process_record(ref,
                                       pos,
                                       line,
                                       nucleotides,
                                       out_incl_ts=output_incl,
                                       out_excl_ts=output_excl,
                                       statistics=statistics,
                                       records=records)

                write_summary(args,
                              os.path.join(args.root, "common.summary"),
                              statistics=statistics)
                write_tfam(os.path.join(args.root, "common.tfam"), data,
                           reader.samples, args.name)
Example #20
def run_admix_pipeline(config):
    config.temp_root = os.path.join(config.destination, "temp")
    if not config.dry_run:
        fileutils.make_dirs(config.temp_root)

    cache = {}
    nodes = []
    for sample in config.samples.itervalues():
        root = sample["Root"]
        nuc_bam = sample["Files"].get("Nuc")
        mito_bam = sample["Files"].get("Mito")

        nodes.extend(build_pipeline(config, root, nuc_bam, mito_bam, cache))

    if config.multisample and not config.admixture_only:
        nodes = [summary.SummaryNode(config, nodes)]

    if not run_pipeline(config, nodes, "\nRunning Zonkey ..."):
        return 1
Example #21
    def _write_config_file(self, config, defaults):
        """Writes a basic config files, using the values previously found in the
        config files, and specified on the command-line."""
        defaults_cfg = ConfigParser.SafeConfigParser()
        defaults_cfg.add_section("Defaults")
        for key in defaults:
            value = getattr(config, key)
            if isinstance(value, (types.ListType, types.TupleType)):
                value = ";".join(value)

            defaults_cfg.set("Defaults", key, str(value))

        filename = self._filenames[-1]
        make_dirs(os.path.dirname(filename))
        with open(filename, "w") as handle:
            defaults_cfg.write(handle)

        print_info("Wrote config file %r" % (filename,))
        sys.exit(0)
Example #22
def with_temp_folder(func):
    """Decorator for unit-tests:
    Creates a unique temporary folder before running 'func'. The
    function is is assumed to take at least one parameter, the first
    of which is assumed to represent the temporary folder."""
    temp_root = os.path.join(tempfile.gettempdir(), os.getlogin())
    make_dirs(temp_root) # Ensure that this subdirectory exists

    @nose.tools.istest
    def _wrapper(*args, **kwargs):
        try:
            temp_folder = None
            temp_folder = tempfile.mkdtemp(dir=temp_root,
                                           prefix="paleomix_unit")
            func(temp_folder, *args, **kwargs)
        finally:
            if temp_folder:
                shutil.rmtree(temp_folder)
    _wrapper.__name__ = func.__name__ + "__wrapped_by_with_temp_folder"
    return _wrapper
Example #23
def setup_example(config):
    root = os.path.join(config.destination, "zonkey_pipeline")
    log = logging.getLogger(__name__)
    log.info("Copying example project to %r", root)

    with tarfile.TarFile(config.database.filename) as tar_handle:
        example_files = []
        existing_files = []
        for member in tar_handle.getmembers():
            if os.path.dirname(member.name) == "examples" and member.isfile():
                example_files.append(member)

                destination = fileutils.reroot_path(root, member.name)
                if os.path.exists(destination):
                    existing_files.append(destination)

        if existing_files:
            log.error("Output files already exist at destination:")
            for filename in sorted(existing_files):
                log.error(" - %r", filename)
            return 1
        elif not example_files:
            log.error(
                "Sample database %r does not contain example data; cannot proceed.",
                config.database.filename,
            )
            return 1

        if not os.path.exists(root):
            fileutils.make_dirs(root)

        for member in example_files:
            destination = fileutils.reroot_path(root, member.name)
            src_handle = tar_handle.extractfile(member)
            with open(destination, "wb") as out_handle:
                shutil.copyfileobj(src_handle, out_handle)

    log.info("Sucessfully saved example data in %r", root)

    return 0
Example #24
def with_temp_folder(func):
    """Decorator for unit-tests:
    Creates a unique temporary folder before running 'func'. The
    function is assumed to take at least one parameter, the first
    of which is assumed to represent the temporary folder."""
    name = pwd.getpwuid(os.geteuid()).pw_name
    temp_root = os.path.join(tempfile.gettempdir(), name)
    make_dirs(temp_root)

    @nose.tools.istest
    def _wrapper(*args, **kwargs):
        try:
            temp_folder = None
            temp_folder = tempfile.mkdtemp(dir=temp_root,
                                           prefix="paleomix_unit")
            func(temp_folder, *args, **kwargs)
        finally:
            if temp_folder:
                shutil.rmtree(temp_folder)

    _wrapper.__name__ = func.__name__ + "__wrapped_by_with_temp_folder"
    return _wrapper
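
A hypothetical test using this decorator: the wrapped function declares the temporary folder as its first parameter, the wrapper supplies it, and the folder is removed again when the test returns:

import os

@with_temp_folder
def test_example(temp_folder):
    # The folder exists only for the duration of this test
    assert os.path.isdir(temp_folder)
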
Example #25
def test_move_dirs__permission_denied(temp_folder):
    dst_folder = os.path.join(temp_folder, "dst")
    file_1 = os.path.join(temp_folder, "file")
    file_2 = os.path.join(dst_folder, "file")
    set_file_contents(file_1, "1")

    # Make destination folder read-only
    assert make_dirs(os.path.join(temp_folder, "dst"))
    mode = os.stat(dst_folder).st_mode
    ro_mode = mode & ~(stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH)
    os.chmod(dst_folder, ro_mode)

    # Non ENOENT errors should be re-raised:
    assert_raises(IOError, move_file, file_1, file_2)
Example #26
def convert_reads(config, destination, record, sink_cache):
    # The source name is used to re-merge split lanes
    name = record.tags.get("PU_src")
    destination = os.path.join(destination, name)
    make_dirs(os.path.join(config.destination, destination))

    def _open_se_sink(reads_type):
        key = (name, reads_type)
        if not get_in(sink_cache, key):
            filename = ReadSink.get_filename(destination, reads_type.lower())
            set_in(sink_cache, key, ReadSink.open(config.destination, filename))
        return key

    for (reads_type, bam_files) in record.bams.iteritems():
        # Processed reads are pre-aligned BAMs which have been cleaned up
        if reads_type in ("Paired", "Processed"):
            # Record "Single" reads; these may result from orphan SE reads
            _open_se_sink("Singleton")

            key = (name, "Paired")
            if not get_in(sink_cache, key):
                set_in(sink_cache, key, PEReadSink.open(config.destination,
                                                        destination))
        else:
            key = _open_se_sink(reads_type)

        sink = get_in(sink_cache, key)
        for filename in bam_files:
            print("%sProcessing file %r" % (_INDENTATION * 4, filename))
            with pysam.Samfile(filename) as handle:
                def _keep_record(record):
                    return (record.qual >= config.min_quality) and \
                        (len(record.seq) >= config.min_length)

                sink.write_records(record for record in handle
                                   if _keep_record(record))
Example #27
def test_make_dirs__creation_preempted(temp_folder):
    unwrapped, preempted = os.makedirs, []

    def _wrap_os_makedirs(*args, **kwargs):
        # Simulate somebody else creating the directory first
        preempted.append(True)
        unwrapped(*args, **kwargs)
        unwrapped(*args, **kwargs)

    with Monkeypatch("os.makedirs", _wrap_os_makedirs):
        work_folder = os.path.join(temp_folder, "test")
        assert not make_dirs(work_folder)
        assert os.path.exists(work_folder)
        assert_equal(os.listdir(temp_folder), ["test"])
        assert_equal(preempted, [True])
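
Taken together, the make_dirs tests in this listing pin down its contract: a truthy return when the directory was actually created, a falsy return when it already existed, tolerance of a racing creator (the preempted EEXIST case above), and propagation of any other error. A minimal implementation consistent with that contract might look like this; it is a sketch, not the actual paleomix fileutils code, and the empty-path behaviour exercised by test_make_dirs__empty_directory is not modelled here:

import errno
import os

def make_dirs(directory, mode=0o777):
    # Return True if the directory was created, False if it already
    # existed; other errors (e.g. permission denied) propagate.
    try:
        os.makedirs(directory, mode)
        return True
    except OSError as error:
        # Another process may have created the directory first
        if error.errno != errno.EEXIST:
            raise
        return False
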
Example #28
def test_make_dirs__sub_directories(temp_folder):
    assert not os.listdir(temp_folder)
    assert make_dirs(os.path.join(temp_folder, "test", "123"))
    assert_equal(os.listdir(temp_folder), ["test"])
    assert_equal(os.listdir(os.path.join(temp_folder, "test")), ["123"])
Example #29
def test_make_dirs__permissions(temp_folder):
    work_dir = os.path.join(temp_folder, "test_1")
    assert make_dirs(work_dir, mode=0o511)
    stats = os.stat(work_dir)
    assert_equal(oct(stats.st_mode & 0o777), oct(0o511))
Example #30
def test_make_dirs__empty_directory():
    make_dirs("")
Example #31
def test_make_dirs__create_dir(temp_folder):
    assert not os.listdir(temp_folder)
    assert make_dirs(os.path.join(temp_folder, "test123"))
    assert_equal(os.listdir(temp_folder), ["test123"])
Example #32
    def _run(self, _config, _temp):
        check_fastq_files(self.input_files, self._offset, True)
        output_file = tuple(self.output_files)[0]
        make_dirs(os.path.dirname(output_file))
        with open(output_file, "w"):
            pass
Example #33
def main(argv):
    config, args = parse_options(argv)
    if config is None:
        return 1

    # Get default options for bam_pipeline
    bam_config, _ = bam_cfg.parse_config(args, "bam")
    makefiles = bam_pipeline.read_makefiles(bam_config, args)
    # Build .fai files for reference .fasta files
    bam_pipeline.index_references(bam_config, makefiles)

    for makefile in makefiles:
        mkfile_fname = makefile["Statistics"]["Filename"]
        bam_config.destination = os.path.dirname(mkfile_fname)
        tasks = bam_pipeline.build_pipeline_full(bam_config, makefile,
                                                 return_nodes=False)

        make_dirs(config.destination)
        makefile_name = add_postfix(makefile["Statistics"]["Filename"],
                                    config.postfix)
        makefile_path = reroot_path(config.destination, makefile_name)
        if samefile(makefile["Statistics"]["Filename"], makefile_path):
            sys.stderr.write("ERROR: Would overwrite source makefile at %r\n" % (makefile_path,))
            sys.stderr.write("       Please set --destination and/or --output-name-postfix\n")
            sys.stderr.write("       before continuing.\n")
            return 1

        print("Writing makefile", makefile_path)

        found_prefix = False
        for prefix in makefile["Prefixes"]:
            if prefix != config.prefix:
                print("%sSkipping %s" % (_INDENTATION, prefix))
            else:
                found_prefix = True

        if not found_prefix:
            sys.stderr.write("\nERROR:\n")
            sys.stderr.write("Could not find prefix %r in %r! Aborting ...\n"
                             % (config.prefix, mkfile_fname))
            return 1

        with open(makefile_path, "w") as makefile_handle:
            template = bam_mkfile.build_makefile(add_sample_tmpl=False)
            makefile_handle.write(template)
            makefile_handle.write("\n" * 3)

            for target in tasks:
                target_name = add_postfix(target.name, config.postfix)
                print("%sTarget: %s -> %s" % (_INDENTATION,
                                              target.name,
                                              target_name))

                makefile_handle.write('%s"%s":\n' % (_INDENTATION * 0,
                                                     target_name))
                for prefix in target.prefixes:
                    if prefix.name != config.prefix:
                        continue

                    for sample in prefix.samples:
                        print("%sSample: %s" % (_INDENTATION * 2, sample.name))

                        makefile_handle.write('%s"%s":\n' % (_INDENTATION * 1,
                                                             sample.name))

                        for library in sample.libraries:
                            print("%sLibrary: %s" % (_INDENTATION * 3,
                                                     library.name))
                            makefile_handle.write('%s"%s":\n'
                                                  % (_INDENTATION * 2,
                                                     library.name))

                            sink_cache = {}
                            destination = os.path.join(target_name,
                                                       "reads",
                                                       sample.name,
                                                       library.name)

                            for lane in library.lanes:
                                convert_reads(config, destination, lane, sink_cache)
                            ReadSink.close_all_sinks()

                            for lane_name in sorted(sink_cache):
                                makefile_handle.write('%s"%s":\n' % (_INDENTATION * 3, lane_name))
                                for (reads_type, sink) in sorted(sink_cache[lane_name].items()):
                                    makefile_handle.write('%s%s "%s"\n'
                                                          % (_INDENTATION * 4,
                                                             ("%s:" % (reads_type,)).ljust(20),
                                                             sink.filename))
                                makefile_handle.write("\n")
        print("\tDone ...")
        print()

    return 0
Example #34
def setup_mito_mapping(config):
    genomes_root = os.path.join(config.destination, "genomes")
    if not os.path.exists(genomes_root):
        fileutils.make_dirs(genomes_root)

    mkfile_fpath = os.path.join(config.destination, "makefile.yaml")

    filenames = [mkfile_fpath]
    for name, record in sorted(config.database.mitochondria.iteritems()):
        filenames.append(os.path.join(genomes_root, "%s.fasta"
                                      % (record.name,)))

    existing_filenames = [filename for filename in filenames
                          if os.path.exists(filename)]

    # A bit strict, but avoids accidental overwrites
    if existing_filenames:
        print_err("ERROR: Output file(s) already exist, "
                  "cannot proceed:\n    %s"
                  % ("\n    ".join(map(repr, existing_filenames))))

        return 1

    with open(mkfile_fpath, "w") as mkfile:
        mkfile.write(bam_mkfile.build_makefile(add_prefix_tmpl=False,
                                               add_sample_tmpl=False))

        mkfile.write("\n\nPrefixes:\n")

        for name, record in sorted(config.database.mitochondria.iteritems()):
            meta = (record.meta or "").upper()
            if "EXCLUDE" in meta:
                continue

            mkfile.write("  %s:\n" % (record.name,))
            mkfile.write("    Path: genomes/%s.fasta\n" % (record.name,))

            info = config.database.samples.get(record.name)
            if info is not None:
                mkfile.write("    # Group: %s\n"
                             % (info.get('Group(3)', 'NA'),))
                mkfile.write("    # Species: %s\n"
                             % (info.get('Species', 'NA'),))
                mkfile.write("    # Sex: %s\n"
                             % (info.get('Sex', 'NA'),))
                mkfile.write("    # Publication: %s\n"
                             % (info.get('Publication', 'NA'),))
                mkfile.write("    # Sample ID: %s\n"
                             % (info.get('SampleID', 'NA'),))

            mkfile.write('\n')

            fasta_fpath = os.path.join(genomes_root,
                                       "%s.fasta" % (record.name,))

            with open(fasta_fpath, "w") as fasta_handle:
                record = FASTA(
                    name=record.name,
                    meta=None,
                    sequence=record.sequence.replace('-', ''))

                fasta_handle.write(str(record))
                fasta_handle.write("\n")

        mkfile.write("\n")

    return 0
Example #35
def test_make_dirs__subdirs_return_values(temp_folder):
    assert make_dirs(os.path.join(temp_folder, "test"))
    assert make_dirs(os.path.join(temp_folder, "test", "234"))
    assert not make_dirs(os.path.join(temp_folder, "test", "234"))
Example #36
def setup_mito_mapping(config):
    genomes_root = os.path.join(config.destination, "genomes")
    if not os.path.exists(genomes_root):
        fileutils.make_dirs(genomes_root)

    mkfile_fpath = os.path.join(config.destination, "makefile.yaml")

    filenames = [mkfile_fpath]
    for name, record in sorted(config.database.mitochondria.items()):
        filenames.append(
            os.path.join(genomes_root, "%s.fasta" % (record.name, )))

    existing_filenames = [
        filename for filename in filenames if os.path.exists(filename)
    ]

    # A bit strict, but avoids accidental overwrites
    if existing_filenames:
        log = logging.getLogger(__name__)
        log.error("Output file(s) already exist, cannot proceed:")
        for filename in sorted(existing_filenames):
            log.error(" - %r", filename)

        return 1

    with open(mkfile_fpath, "w") as mkfile:
        mkfile.write(
            bam_mkfile.build_makefile(add_prefix_tmpl=False,
                                      add_sample_tmpl=False))

        mkfile.write("\n\nPrefixes:\n")

        for name, record in sorted(config.database.mitochondria.items()):
            if "EXCLUDE" in record.meta.upper():
                continue

            mkfile.write("  %s:\n" % (record.name, ))
            mkfile.write("    Path: genomes/%s.fasta\n" % (record.name, ))

            info = config.database.samples.get(record.name)
            if info is not None:
                mkfile.write("    # Species: %s\n" %
                             (info.get("Species", "NA"), ))
                mkfile.write("    # Sex: %s\n" % (info.get("Sex", "NA"), ))
                mkfile.write("    # Publication: %s\n" %
                             (info.get("Publication", "NA"), ))
                mkfile.write("    # Sample ID: %s\n" %
                             (info.get("SampleID", "NA"), ))

            mkfile.write("\n")

            fasta_fpath = os.path.join(genomes_root,
                                       "%s.fasta" % (record.name, ))

            with open(fasta_fpath, "w") as fasta_handle:
                record = FASTA(
                    name=record.name,
                    meta=None,
                    sequence=record.sequence.replace("-", ""),
                )

                record.write(fasta_handle)

        mkfile.write("\n")

    return 0
Example #37
def setup_mito_mapping(config):
    genomes_root = os.path.join(config.destination, "genomes")
    if not os.path.exists(genomes_root):
        fileutils.make_dirs(genomes_root)

    mkfile_fpath = os.path.join(config.destination, "makefile.yaml")

    filenames = [mkfile_fpath]
    for name, record in sorted(config.database.mitochondria.iteritems()):
        filenames.append(os.path.join(genomes_root, "%s.fasta"
                                      % (record.name,)))

    existing_filenames = [filename for filename in filenames
                          if os.path.exists(filename)]

    # A bit strict, but avoids accidental overwrites
    if existing_filenames:
        print_err("ERROR: Output file(s) already exist, "
                  "cannot proceed:\n    %s"
                  % ("\n    ".join(map(repr, existing_filenames))))

        return 1

    with open(mkfile_fpath, "w") as mkfile:
        mkfile.write(bam_mkfile.build_makefile(add_prefix_tmpl=False,
                                               add_sample_tmpl=False))

        mkfile.write("\n\nPrefixes:\n")

        for name, record in sorted(config.database.mitochondria.iteritems()):
            meta = (record.meta or "").upper()
            if "EXCLUDE" in meta:
                continue

            mkfile.write("  %s:\n" % (record.name,))
            mkfile.write("    Path: genomes/%s.fasta\n" % (record.name,))

            info = config.database.samples.get(record.name)
            if info is not None:
                mkfile.write("    # Group: %s\n"
                             % (info.get('Group(3)', 'NA'),))
                mkfile.write("    # Species: %s\n"
                             % (info.get('Species', 'NA'),))
                mkfile.write("    # Sex: %s\n"
                             % (info.get('Sex', 'NA'),))
                mkfile.write("    # Publication: %s\n"
                             % (info.get('Publication', 'NA'),))
                mkfile.write("    # Sample ID: %s\n"
                             % (info.get('SampleID', 'NA'),))

            mkfile.write('\n')

            fasta_fpath = os.path.join(genomes_root,
                                       "%s.fasta" % (record.name,))

            with open(fasta_fpath, "w") as fasta_handle:
                fasta_handle.write(str(record))
                fasta_handle.write("\n")

        mkfile.write("\n")

    return 0