예제 #1
0
def test_gaps():
    """Check that generate_gap_bed reproduces the reference gap BED file.

    The gaps of ``tests/data/gap.fa`` are written to a temporary file and
    the result is compared byte-for-byte (as text) with
    ``tests/data/gap.bed``.
    """
    infa = "tests/data/gap.fa"
    outbed = "tests/data/gap.bed"

    # Keep the temporary file object alive for the whole comparison: taking
    # only ``.name`` from a discarded NamedTemporaryFile lets the file be
    # removed right away, and the path written afterwards is never cleaned up.
    # NOTE(review): re-opening the file by name while it is still open is
    # not supported on Windows — confirm the test suite is POSIX-only.
    with NamedTemporaryFile() as tmp:
        generate_gap_bed(infa, tmp.name)

        with open(tmp.name) as result_handle:
            result = result_handle.read()

    with open(outbed) as expect_handle:
        expect = expect_handle.read()

    assert result == expect
예제 #2
0
    def gap_sizes(self):
        """Return gap sizes per chromosome.

        The sizes are read from the gaps BED file referenced by
        ``self.props["gaps"]["gaps"]``; that file is generated from
        ``self.filename`` first when it does not exist. The result is
        stored on ``self._gap_sizes`` and reused while it is non-empty.

        Returns
        -------
        gap_sizes : dict
            a dictionary with chromosomes as key and the total number of
            Ns as values

        Raises
        ------
        ValueError
            If a non-blank line has fewer than three tab-separated columns
            or non-integer start/end coordinates.
        """
        if not self._gap_sizes:
            gap_file = self.props["gaps"]["gaps"]

            # generate gap file if not found
            if not os.path.exists(gap_file):
                generate_gap_bed(self.filename, gap_file)

            # Accumulate into a local dict and publish it only once the whole
            # file has been parsed, so a failure halfway through cannot leave
            # a partial (but truthy) cache behind.
            sizes = {}
            with open(gap_file) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # tolerate blank / trailing empty lines
                        continue
                    # only the first three BED columns are needed; any
                    # additional columns are ignored
                    chrom, start, end = line.split("\t")[:3]
                    sizes[chrom] = sizes.get(chrom, 0) + int(end) - int(start)
            self._gap_sizes = sizes
        return self._gap_sizes
예제 #3
0
파일: genome.py 프로젝트: masastat/genomepy
 def gaps_file(self, fname):
     """Remember ``fname`` as the gaps file, creating it first if absent."""
     already_present = os.path.exists(fname)
     if not already_present:
         # no file at that path yet: build it from the genome file
         generate_gap_bed(self.genome_file, fname)
     self.__gaps_file = fname
예제 #4
0
def install_genome(name,
                   provider,
                   genome_dir=None,
                   localname=None,
                   mask="soft",
                   regex=None,
                   invert_match=False,
                   bgzip=None,
                   annotation=False,
                   force=False,
                   **kwargs):
    """
    Install a genome.

    Downloads the genome (and optionally its annotation) from the given
    provider unless matching files are already present, instantiates the
    Genome, runs the active plugins, makes sure a gaps BED file exists and
    finally calls ``generate_env``.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the ``genome_dir``
        entry of the configuration when not given.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', choices 'hard'/'soft'/'none' for respective masking level.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    force : bool , optional
        Set to True to overwrite existing files.

    kwargs : dict, optional
        Provider specific options.
        Ensembl:

        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.

        version : int, optional
            Ensembl only: Specify release version. Default is latest.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome_dir is passed in and none is configured.
    """
    # Resolve the target directory: explicit argument first, config second
    genome_dir = genome_dir or config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError(
            "Please provide or configure a genome_dir")

    genome_dir = os.path.expanduser(genome_dir)
    localname = get_localname(name, localname)
    out_dir = os.path.join(genome_dir, localname)

    # Genome: fetch when no FASTA is present yet, or when forced
    fasta_present = any(
        os.path.exists(path) for path in glob_ext_files(out_dir, "fa"))
    if force or not fasta_present:
        genome_provider = ProviderBase.create(provider)
        genome_provider.download_genome(name,
                                        genome_dir,
                                        mask=mask,
                                        regex=regex,
                                        invert_match=invert_match,
                                        localname=localname,
                                        bgzip=bgzip,
                                        **kwargs)

    # Annotation: only on request, and only when no GTF is present yet
    # (or when forced)
    gtf_present = any(
        os.path.exists(path) for path in glob_ext_files(out_dir, "gtf"))
    if annotation and (force or not gtf_present):
        annotation_provider = ProviderBase.create(provider)
        annotation_provider.download_annotation(
            name, genome_dir, localname=localname, **kwargs)

    # Instantiate the installed genome and hand it to every active plugin
    genome = Genome(localname, genome_dir=genome_dir)
    for active_plugin in get_active_plugins():
        active_plugin.after_genome_download(genome, force)

    # Gaps BED: create when missing, recreate when forced
    gap_file = os.path.join(out_dir, localname + ".gaps.bed")
    gap_file_missing = not os.path.exists(gap_file)
    if gap_file_missing or force:
        fasta_files = glob_ext_files(out_dir, "fa")
        generate_gap_bed(fasta_files[0], gap_file)

    generate_env()
예제 #5
0
def install_genome(
    name,
    provider=None,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    annotation=False,
    only_annotation=False,
    skip_sanitizing=False,
    threads=1,
    force=False,
    **kwargs,
):
    """
    Install a genome.

    Depending on the flags this downloads the genome, downloads (and
    sanitizes) the annotation, and runs the active plugins on the
    installed genome.

    Parameters
    ----------
    name : str
        Genome name

    provider : str , optional
        Provider name. will try Ensembl, UCSC and NCBI (in that order) if not specified.

    genomes_dir : str , optional
        Where to store the fasta files

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', choices 'hard'/'soft'/'none' for respective masking level.

    keep_alt : bool , optional
        Some genomes contain alternative regions. These regions cause issues with
        sequence alignment, as they are inherently duplications of the consensus regions.
        Set to true to keep these alternative regions.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    threads : int , optional
        Build genome index using multithreading (if supported). Default: lowest of 8/all threads

    force : bool , optional
        Set to True to overwrite existing files.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    only_annotation : bool , optional
        If set to True, only download the annotation files.

    skip_sanitizing : bool , optional
        If set to True, downloaded annotation files whose sequence names do not match
        with the (first header fields of) the genome.fa will not be corrected.

    kwargs : dict , optional
        Provider specific options.
        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.

        version : int , optional
            Ensembl only: Specify release version. Default is latest.

        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.
    """
    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)

    # Genome download: skipped for annotation-only installs, otherwise done
    # when the genome is not there yet or when forced
    genome_found = _is_genome_dir(out_dir)
    if not only_annotation and (force or not genome_found):
        genome_provider = _provider_selection(
            name, localname, genomes_dir, provider
        )
        genome_provider.download_genome(
            name,
            genomes_dir,
            mask=mask,
            keep_alt=keep_alt,
            regex=regex,
            invert_match=invert_match,
            localname=localname,
            bgzip=bgzip,
            **kwargs,
        )
        genome_found = True

        # Export installed genome(s)
        generate_env(genomes_dir=genomes_dir)

    # Instantiate the genome when one is available; on force, regenerate
    # its sizes and gaps files as well
    g = Genome(localname, genomes_dir=genomes_dir) if genome_found else None
    if genome_found and force:
        generate_fa_sizes(g.genome_file, g.sizes_file)
        generate_gap_bed(g.genome_file, g.gaps_file)

    # Any annotation-related flag or provider option switches annotation on
    annotation_triggers = (
        annotation,
        only_annotation,
        skip_sanitizing,
        kwargs.get("to_annotation"),
        kwargs.get("ucsc_annotation_type"),
    )
    annotation = any(annotation_triggers)

    annotation_found = bool(glob_ext_files(out_dir, "gtf"))
    if annotation and (force or not annotation_found):
        annotation_provider = _provider_selection(
            name, localname, genomes_dir, provider
        )
        annotation_provider.download_annotation(
            name, genomes_dir, localname=localname, **kwargs
        )

        # Sanitizing needs both the genome and a downloaded GTF
        annotation_found = bool(glob_ext_files(out_dir, "gtf"))
        if genome_found and annotation_found and not skip_sanitizing:
            sanitize_annotation(g)

    # Plugins operate on the genome, so there is nothing left to do without one
    if not genome_found:
        return

    for active_plugin in get_active_plugins():
        active_plugin.after_genome_download(g, threads, force)
예제 #6
0
def install_genome(name,
                   provider,
                   version=None,
                   genome_dir=None,
                   localname=None,
                   mask="soft",
                   regex=None,
                   invert_match=False,
                   annotation=False):
    """
    Install a genome.

    Downloads the genome (and optionally its annotation) from the given
    provider, then generates the chromosome sizes and the gaps BED file
    next to the downloaded FASTA.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    version : str
        Version (only for Ensembl)

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the ``genome_dir``
        entry of the configuration when not given.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', specify 'hard' for hard masking.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome_dir is passed in and none is configured.
    """
    # Resolve the target directory: explicit argument first, config second
    genome_dir = genome_dir or config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError(
            "Please provide or configure a genome_dir")

    genome_dir = os.path.expanduser(genome_dir)

    # The name returned by the provider's download is the one used for all
    # following steps
    genome_provider = ProviderBase.create(provider)
    name = genome_provider.download_genome(name,
                                           genome_dir,
                                           version=version,
                                           mask=mask,
                                           localname=localname,
                                           regex=regex,
                                           invert_match=invert_match)

    if annotation:
        genome_provider.download_annotation(name, genome_dir, version=version)

    # Create chromosome sizes
    generate_sizes(name, genome_dir)

    # Derive FASTA and gaps BED paths inside the genome's own directory
    genome_subdir = os.path.join(genome_dir, name)
    fasta_path = os.path.join(genome_subdir, "{}.fa".format(name))
    gaps_path = os.path.join(genome_subdir, "{}.gaps.bed".format(name))
    generate_gap_bed(fasta_path, gaps_path)