Пример #1
0
def _as_seqdict_genome_regions(regions, minsize=None):
    """
    Convert genome-prefixed regions into a sequence dictionary.

    Accepts list of regions where the genome is encoded in the region,
    using the genome@chrom:start-end format.

    Parameters
    ----------
    regions : list of str
        Regions formatted as ``genome@chrom:start-end``.

    minsize : int , optional
        Passed unchanged to ``_check_minsize()``.

    Returns
    -------
    The result of ``_check_minsize()`` applied to a dict that maps every
    input region to its sequence, in the order of ``regions``.

    Raises
    ------
    ValueError
        If a region does not contain exactly one "@" separator.
    """
    # Local import: only needed to clean up the temporary FASTA file.
    import os

    # Group the bare regions per genome, so each genome is queried once.
    genomic_regions = {}
    for full_region in regions:
        genome, region = full_region.split("@")
        genomic_regions.setdefault(genome, []).append(region)

    # Instantiate every genome up front: a genome that cannot be loaded
    # fails here, before any sequence is extracted.
    genomes = {genome: Genome(genome) for genome in genomic_regions}

    tmpfa = NamedTemporaryFile(mode="w", delete=False)
    try:
        # Leaving the with-block flushes and closes the handle, so the file
        # can be re-opened by name on every platform.
        with tmpfa:
            for genome, g_regions in genomic_regions.items():
                for seq in genomes[genome].track2fasta(g_regions):
                    # Restore the genome prefix so names match the input.
                    seq.name = f"{genome}@{seq.name}"
                    print(repr(seq), file=tmpfa)

        # Open tempfile and restore original sequence order
        fa = as_seqdict(tmpfa.name)
        fa = {region: fa[region] for region in regions}
        return _check_minsize(fa, minsize)
    finally:
        # delete=False means nobody else removes the file; do it here so
        # repeated calls do not accumulate temporary FASTA files.
        os.unlink(tmpfa.name)
Пример #2
0
def _genomepy_convert(to_convert, genome, minsize=None):
    """
    Convert a variety of inputs using track2fasta().
    """
    if genome is None:
        raise ValueError("input file is not a FASTA file, need a genome!")

    g = Genome(genome)
    tmpfile = NamedTemporaryFile()
    g.track2fasta(to_convert, tmpfile.name)

    fa = as_seqdict(tmpfile.name)
    return _check_minsize(fa, minsize)
Пример #3
0
def generate_exports(genomes_dir=None):
    """
    Build shell export commands for all installed genomes.

    For every name returned by ``list_installed_genomes(genomes_dir)`` one
    ``export NAME=path`` line is created, where NAME is the upper-cased
    genome name with each run of non-word characters replaced by "_".
    Genomes that raise FastaIndexingError or FileNotFoundError are skipped.

    Returns
    -------
    list of str
        The export commands (nothing is printed here).
    """
    exports = []
    for genome_name in list_installed_genomes(genomes_dir):
        try:
            fasta_path = Genome(genome_name).filename
        except (FastaIndexingError, FileNotFoundError):
            # best effort: ignore genomes that cannot be loaded
            continue
        variable = re.sub(r"[^\w]+", "_", genome_name).upper()
        exports.append(f"export {variable}={fasta_path}")
    return exports
Пример #4
0
def install_genome(
    name,
    provider=None,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    annotation=False,
    only_annotation=False,
    skip_sanitizing=False,
    threads=1,
    force=False,
    **kwargs,
):
    """
    Download a genome and/or its gene annotation and run the active plugins.

    Parameters
    ----------
    name : str
        Genome name.

    provider : str , optional
        Provider name. Will try Ensembl, UCSC and NCBI (in that order)
        if not specified.

    genomes_dir : str , optional
        Directory in which the genome folder is stored.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Masking level: 'hard', 'soft' or 'none'. Default is 'soft'.

    keep_alt : bool , optional
        Some genomes contain alternative regions. These cause issues with
        sequence alignment, as they are inherently duplications of the
        consensus regions. Set to True to keep them.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file is used.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    only_annotation : bool , optional
        If set to True, only download the annotation files.

    skip_sanitizing : bool , optional
        If set to True, downloaded annotation files whose sequence names do
        not match (the first header fields of) the genome.fa are not
        corrected. Setting this flag also requests the annotation.

    threads : int , optional
        Thread count handed to every active plugin.

    force : bool , optional
        Set to True to overwrite existing files.

    kwargs : dict , optional
        Provider specific options.
        toplevel : bool , optional
            Ensembl only: always download the toplevel genome.
            Ignores potential primary assembly.

        version : int , optional
            Ensembl only: specify release version. Default is latest.

        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.
    """
    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)

    # --- genome -----------------------------------------------------------
    have_genome = _is_genome_dir(out_dir)
    if not only_annotation and (force or not have_genome):
        genome_provider = _provider_selection(name, localname, genomes_dir, provider)
        genome_provider.download_genome(
            name,
            genomes_dir,
            mask=mask,
            keep_alt=keep_alt,
            regex=regex,
            invert_match=invert_match,
            localname=localname,
            bgzip=bgzip,
            **kwargs,
        )
        have_genome = True

        # make the freshly installed genome visible in the exports
        generate_env(genomes_dir=genomes_dir)

    # Instantiating Genome creates the Fasta object, index, gaps and sizes.
    installed = Genome(localname, genomes_dir=genomes_dir) if have_genome else None
    if have_genome and force:
        # replace the support files of any previous install
        generate_fa_sizes(installed.genome_file, installed.sizes_file)
        generate_gap_bed(installed.genome_file, installed.gaps_file)

    # --- annotation -------------------------------------------------------
    # Any annotation-related flag or provider option requests the annotation.
    annotation_requested = bool(
        annotation
        or only_annotation
        or skip_sanitizing
        or kwargs.get("to_annotation")
        or kwargs.get("ucsc_annotation_type")
    )
    have_annotation = bool(glob_ext_files(out_dir, "gtf"))
    if annotation_requested and (force or not have_annotation):
        annotation_provider = _provider_selection(
            name, localname, genomes_dir, provider
        )
        annotation_provider.download_annotation(
            name, genomes_dir, localname=localname, **kwargs
        )

        # sanitizing compares against the genome, so it needs both
        have_annotation = bool(glob_ext_files(out_dir, "gtf"))
        if have_genome and have_annotation and not skip_sanitizing:
            sanitize_annotation(installed)

    # --- plugins ----------------------------------------------------------
    if not have_genome:
        return None
    for plugin in get_active_plugins():
        plugin.after_genome_download(installed, threads, force)
Пример #5
0
def install_genome(
    name: str,
    provider: Optional[str] = None,
    genomes_dir: Optional[str] = None,
    localname: Optional[str] = None,
    mask: Optional[str] = "soft",
    keep_alt: Optional[bool] = False,
    regex: Optional[str] = None,
    invert_match: Optional[bool] = False,
    bgzip: Optional[bool] = None,  # None -> check config. False -> dont check.
    annotation: Optional[bool] = False,
    only_annotation: Optional[bool] = False,
    skip_matching: Optional[bool] = False,
    skip_filter: Optional[bool] = False,
    threads: Optional[int] = 1,
    force: Optional[bool] = False,
    **kwargs: Optional[dict],
) -> Genome:
    """
    Install a genome (& gene annotation).

    Downloads whatever is missing (or everything requested, if forced),
    filters the genome, sanitizes the annotation, runs the active plugins
    and finally compresses the files that were downloaded in this call.

    Parameters
    ----------
    name : str
        Genome name

    provider : str , optional
        Provider name. will try Ensembl, UCSC and NCBI (in that order) if not specified.

    genomes_dir : str , optional
        Where to create the output folder.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Genome masking of repetitive sequences. Options: hard/soft/none, default is soft.

    keep_alt : bool , optional
        Some genomes contain alternative regions. These regions cause issues with
        sequence alignment, as they are inherently duplications of the consensus regions.
        Set to true to keep these alternative regions.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that *don't* match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip,
        and gene annotation will be compressed with gzip.
        If None (the default), the "bgzip" entry of the configuration decides.

    threads : int , optional
        Build genome index using multithreading (if supported). Default: lowest of 8/all threads.
        NOTE(review): the signature default is 1 and the value is only passed
        on to the plugins here - confirm where the "lowest of 8/all" rule applies.

    force : bool , optional
        Set to True to overwrite existing files.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    only_annotation : bool , optional
        If set to True, only download the gene annotation files.

    skip_matching : bool , optional
        If set to True, contigs in the annotation not matching
        those in the genome will not be corrected.
        Setting this flag also requests the annotation download.

    skip_filter : bool , optional
        If set to True, the gene annotations will not be filtered to match the genome contigs.
        Setting this flag also requests the annotation download.

    kwargs : dict , optional
        Provider specific options.

        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.

        version : int , optional
            Ensembl only: Specify release version. Default is latest.

        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.

    Returns
    -------
    Genome
        Genome class with the installed genome.
        None if no genome was downloaded and none was found in the output
        folder (e.g. only the annotation was requested for a new genome).
    """
    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    genome_file = os.path.join(out_dir, f"{localname}.fa")
    # The provider is resolved up front, also when nothing ends up being downloaded.
    provider = _provider_selection(name, localname, genomes_dir, provider)

    # check which files need to be downloaded
    # NOTE: the flags are compared by identity (is True / is False),
    # so only real booleans trigger these branches.
    genome_found = _is_genome_dir(out_dir)
    download_genome = (
        genome_found is False or force is True
    ) and only_annotation is False
    # The annotation only counts as present if both the GTF and the BED exist.
    annotation_found = bool(glob_ext_files(out_dir, "annotation.gtf")) and bool(
        glob_ext_files(out_dir, "annotation.bed")
    )
    # Any annotation-related flag or provider option requests the annotation.
    download_annotation = (annotation_found is False or force is True) and any(
        [
            annotation,
            only_annotation,
            skip_matching,
            skip_filter,
            kwargs.get("to_annotation"),
            kwargs.get("path_to_annotation"),
            kwargs.get("ucsc_annotation_type"),
        ]
    )

    genome = None
    genome_downloaded = False
    if download_genome:
        if force:
            # remove the previous FASTA and index before downloading again
            _delete_extensions(out_dir, ["fa", "fai"])
        provider.download_genome(
            name,
            genomes_dir,
            mask=mask,
            localname=localname,
            **kwargs,
        )
        genome_found = True
        genome_downloaded = True

        # Filter genome
        _filter_genome(genome_file, regex, invert_match, keep_alt)

        # Generates a Fasta object and the genome index, gaps and sizes files
        genome = Genome(localname, genomes_dir=genomes_dir)

        # Download the NCBI assembly report
        # (skipped if the report already exists or the accession is "na")
        asm_report = os.path.join(out_dir, "assembly_report.txt")
        asm_acc = genome.assembly_accession
        if not os.path.exists(asm_report) and asm_acc != "na":
            download_assembly_report(asm_acc, asm_report)

        # Export installed genome(s)
        generate_env(genomes_dir=genomes_dir)

    annotation_downloaded = False
    if download_annotation:
        if force:
            # remove the previous annotation files before downloading again
            _delete_extensions(out_dir, ["annotation.gtf", "annotation.bed"])
        provider.download_annotation(name, genomes_dir, localname=localname, **kwargs)
        # Only treat the download as successful if both files are now present.
        annotation_downloaded = bool(
            glob_ext_files(out_dir, "annotation.gtf")
        ) and bool(glob_ext_files(out_dir, "annotation.bed"))

    if annotation_downloaded:
        # From here on `annotation` is the Annotation object, not the input flag.
        annotation = Annotation(localname, genomes_dir=genomes_dir)
        # Sanitizing needs a genome, and is skipped if both steps are disabled.
        if genome_found and not (skip_matching and skip_filter):
            annotation.sanitize(not skip_matching, not skip_filter, True)

    # Run active plugins (also if the genome was downloaded earlier)
    if genome_found:
        genome = genome if genome else Genome(localname, genomes_dir=genomes_dir)
        for plugin in get_active_plugins():
            plugin.after_genome_download(genome, threads, force)

    # zip files downloaded now
    # (files that were already present before this call are left as they are)
    if bgzip is True or (bgzip is None and config.get("bgzip")):
        if genome_downloaded:
            bgzip_and_name(genome.filename)
        if annotation_downloaded:
            gzip_and_name(annotation.annotation_gtf_file)
            gzip_and_name(annotation.annotation_bed_file)

    return genome