Пример #1
0
def search(term, provider=None):
    """
    Search for a genome.

    If provider is specified, search only that specific provider, else
    search all providers (the direct "url" provider is skipped in that
    case). Both the name and description are used for the search.
    Search term is case-insensitive.

    Parameters
    ----------
    term : str
        Search term, case-insensitive.

    provider : str , optional
        Provider name

    Yields
    ------
    list
        Provider name followed by the fields of one matching row
        (name/identifier and description). Fields that can be encoded
        (text) are yielded as latin-1 encoded bytes; any other field is
        yielded unchanged.
    """
    if provider:
        providers = [ProviderBase.create(provider)]
    else:
        # if provider is not specified search all providers (except direct url)
        providers = [
            ProviderBase.create(p) for p in ProviderBase.list_providers()
            if p != "url"
        ]
    for p in providers:
        for row in p.search(term):
            # Only text has an ``encode`` method; passing other field types
            # (numbers, None, bytes) through avoids an AttributeError that
            # would otherwise abort the whole search.
            yield [
                x.encode("latin-1") if hasattr(x, "encode") else x
                for x in [p.name] + list(row)
            ]
Пример #2
0
def list_available_genomes(provider=None):
    """
    Iterate over the genomes offered by one provider or by all of them.

    Parameters
    ----------
    provider : str, optional
        Name of a single provider to query. When omitted, every provider
        reported by ``ProviderBase.list_providers()`` is queried.

    Yields
    ------
    list
        The provider name followed by the fields of one row produced by
        that provider's ``list_available_genomes()``.
    """
    # A single requested provider short-circuits the provider listing.
    names = [provider] if provider else ProviderBase.list_providers()

    # All provider objects are created before the first row is produced.
    instances = [ProviderBase.create(name) for name in names]

    for instance in instances:
        for genome_row in instance.list_available_genomes():
            entry = [instance.name]
            entry.extend(genome_row)
            yield entry
Пример #3
0
def search(term, provider=None):
    """
    Search for a genome.

    If provider is specified, search only that specific provider, else
    search all providers. Both the name and description are used for the
    search. Search term is case-insensitive.

    Parameters
    ----------
    term : str
        Search term, case-insensitive.

    provider : str , optional
        Provider name

    Yields
    ------
    list
        Provider name followed by the fields of one matching row
        (name/identifier and description). Fields that can be encoded
        (text) are yielded as latin-1 encoded bytes; any other field is
        yielded unchanged.
    """
    if provider:
        providers = [ProviderBase.create(provider)]
    else:
        # if provider is not specified search all providers
        providers = [ProviderBase.create(p) for
                     p in ProviderBase.list_providers()]

    for p in providers:
        for row in p.search(term):
            # Only text has an ``encode`` method; passing other field types
            # (numbers, None, bytes) through avoids an AttributeError that
            # would otherwise abort the whole search.
            yield [x.encode('latin-1') if hasattr(x, 'encode') else x
                   for x in [p.name] + list(row)]
Пример #4
0
def _online_providers():
    """
    Build provider objects for every listed provider that can be created.

    A provider whose creation raises ``ConnectionError`` is left out; the
    error text is written to stderr instead.

    Returns
    -------
    list
        The provider objects that were created successfully.
    """
    reachable = []
    for provider_name in ProviderBase.list_providers():
        try:
            instance = ProviderBase.create(provider_name)
        except ConnectionError as err:
            # Report the failure and carry on with the remaining providers.
            sys.stderr.write(str(err))
        else:
            reachable.append(instance)
    return reachable
Пример #5
0
def ensembl_genome_info(genome_name: str) -> Tuple[str, str, str]:
    """Return Ensembl genome information for a local genome managed by genomepy.

    Parameters
    ----------
    genome_name : str
        Name of local genome.

    Returns
    -------
    (str, str, str) or None
        Ensembl name, accession, taxonomy_id.
        None is returned when the genome is not installed locally, when the
        Ensembl search yields no result, or when the accession of the first
        Ensembl hit differs from the assembly accession of the local genome.
    """
    # Fast lookup for some common queries
    common_names = {
        "danRer11": "GRCz11",
        "hg38": "GRCh38",
        "mm10": "GRCm38",
        "dm6": "BDGP6.28",
    }
    if genome_name in common_names:
        search_term = common_names[genome_name]
    else:
        try:
            genome = Genome(genome_name)
            search_term = genome.tax_id
        except FileNotFoundError:
            logger.info(f"Genome {genome_name} not installed locally")
            p = ProviderBase.create("Ensembl")
            for name, *_rest in p.search(genome_name):
                if name == genome_name:
                    logger.info(
                        f"It can be downloaded from Ensembl: genomepy install {name} Ensembl --annotation"
                    )
                    break
            # Without a local genome there is nothing to compare against.
            return None

    # search Ensembl by taxonomy_id or by specific Ensembl name (if we know it)
    p = ProviderBase.create("Ensembl")
    # Only the first hit is used, so take it lazily; an empty search result
    # is reported as "no match" instead of raising IndexError.
    first_hit = next(iter(p.search(search_term)), None)
    if first_hit is None:
        print("Could not find a matching genome in Ensembl")
        return None
    name, accession, _species, tax_id, *_ = first_hit

    # Check if the assembly_id of the current Ensembl genome is the same as the
    # local genome. If it is identical, we can correctly assume that the genomes
    # sequences are identical.
    # For the genomes in the lookup table, we already know they match.
    # (`genome` is only bound when the lookup table was not used; the
    # short-circuiting `or` keeps it from being read otherwise.)
    if genome_name in common_names or accession == genome.assembly_accession:
        return name, accession, tax_id

    print("Could not find a matching genome in Ensembl")
    return None
Пример #6
0
def install_genome(
    name,
    provider,
    version=None,
    genome_dir=None,
    localname=None,
    mask="soft",
    regex=None,
    invert_match=False,
    annotation=False,
):
    """
    Download a genome from a provider and run the active plugins on it.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    version : str , optional
        Version (only for Ensembl)

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the configured
        "genome_dir" when not given.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', specify 'hard' for hard masking.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome directory is given and none is configured.
    """
    target_dir = genome_dir or config.get("genome_dir", None)
    if not target_dir:
        raise norns.exceptions.ConfigError("Please provide or configure a genome_dir")
    target_dir = os.path.expanduser(target_dir)

    # Fetch the genome; the provider reports the name it was stored under.
    source = ProviderBase.create(provider)
    download_options = dict(
        version=version,
        mask=mask,
        localname=localname,
        regex=regex,
        invert_match=invert_match,
    )
    installed_name = source.download_genome(name, target_dir, **download_options)

    # Optionally fetch the annotation from the same provider.
    if annotation:
        source.download_annotation(installed_name, target_dir, version=version)

    installed = Genome(installed_name, genome_dir=target_dir)
    for plugin in get_active_plugins():
        plugin.after_genome_download(installed)
Пример #7
0
def list_available_providers():
    """
    Report the names of all available providers.

    Returns
    -------
    The result of ``ProviderBase.list_providers()``, passed through
    unchanged.
    """
    provider_names = ProviderBase.list_providers()
    return provider_names
Пример #8
0
def list_available_providers():
    """
    Report the names of all available providers.

    Returns
    -------
    The result of ``ProviderBase.list_providers()``, passed through
    unchanged.
    """
    provider_names = ProviderBase.list_providers()
    return provider_names
Пример #9
0
def list_available_genomes(provider=None):
    """
    Iterate over the genomes offered by one provider or by all of them.

    Parameters
    ----------
    provider : str, optional
        Name of a single provider to query. When omitted, every provider
        reported by ``ProviderBase.list_providers()`` is queried.

    Yields
    ------
    list
        The provider name followed by the fields of one row produced by
        that provider's ``list_available_genomes()``.
    """
    if provider:
        names = [provider]
    else:
        # No provider requested: query each of the listed providers.
        names = ProviderBase.list_providers()

    # All provider objects are created before the first row is produced.
    instances = [ProviderBase.create(name) for name in names]

    for instance in instances:
        for genome_row in instance.list_available_genomes():
            entry = [instance.name]
            entry.extend(genome_row)
            yield entry
Пример #10
0
def test__update_assembly_accession(genome="tests/data/small_genome.fa.gz"):
    """Check the assembly accession written into metadata, without and with a provider genome."""
    local_genome = genomepy.Genome(genome)

    # Without provider information the accession is recorded as "na".
    fallback_metadata = {}
    local_genome._update_assembly_accession(fallback_metadata)
    assert fallback_metadata["assembly_accession"] == "na"

    # With the NCBI entry for ASM14646v1 the accession is filled in.
    resolved_metadata = {}
    ncbi = ProviderBase.create("NCBI")
    ncbi_entry = ncbi.genomes.get("ASM14646v1")

    local_genome._update_assembly_accession(resolved_metadata, ncbi, ncbi_entry)
    assert resolved_metadata["assembly_accession"] == "GCA_000146465.1"
Пример #11
0
def test__update_tax_id(genome="tests/data/small_genome.fa.gz"):
    """Check the taxonomy id written into metadata, without and with a provider genome."""
    local_genome = genomepy.Genome(genome)

    # Without provider information the taxonomy id is recorded as "na".
    fallback_metadata = {}
    local_genome._update_tax_id(fallback_metadata)
    assert fallback_metadata["tax_id"] == "na"

    # With the NCBI entry for ASM14646v1 the taxonomy id is filled in.
    resolved_metadata = {}
    ncbi = ProviderBase.create("NCBI")
    ncbi_entry = ncbi.genomes.get("ASM14646v1")

    local_genome._update_tax_id(resolved_metadata, ncbi, ncbi_entry)
    assert resolved_metadata["tax_id"] == "58839"
Пример #12
0
def ncbi_assembly_report(asm_acc: str) -> pd.DataFrame:
    """Retrieve the NCBI assembly report as a DataFrame.

    The accession is looked up through the NCBI provider to obtain the
    assembly name, which is needed to build the location of the report on
    the NCBI FTP server.

    Parameters
    ----------
    asm_acc : str
        Assembly accession (GCA or GCF)

    Returns
    -------
    pandas.DataFrame
        NCBI assembly report.

    Raises
    ------
    Exception
        If the NCBI search returns more than one genome for the accession.
    IndexError
        If the NCBI search returns no genome for the accession.
    """
    p = ProviderBase.create("NCBI")
    ncbi_search = list(p.search(asm_acc))
    if len(ncbi_search) > 1:
        raise Exception("More than one genome for accession")
    if not ncbi_search:
        # Same exception type the bare index lookup used to raise, but with a
        # message that says what actually went wrong.
        raise IndexError(f"No NCBI genome found for accession {asm_acc}")

    # The first field of the row is used as the assembly name; spaces are
    # replaced by underscores to build the path.
    ncbi_name = ncbi_search[0][0].replace(" ", "_")

    # NCBI FTP location of assembly report: the accession prefix and three
    # groups of three characters taken from the accession form the
    # directory hierarchy.
    logger.info(f"Found NCBI assembly {asm_acc} with name {ncbi_name}")
    assembly_report = (
        f"ftp://ftp.ncbi.nlm.nih.gov/genomes/all/{asm_acc[0:3]}/"
        f"{asm_acc[4:7]}/{asm_acc[7:10]}/{asm_acc[10:13]}/"
        f"{asm_acc}_{ncbi_name}/{asm_acc}_{ncbi_name}_assembly_report.txt"
    )

    logger.info(f"Downloading {assembly_report}")
    # Column names are supplied explicitly because lines starting with "#"
    # are skipped as comments when reading.
    header = [
        "Sequence-Name",
        "Sequence-Role",
        "Assigned-Molecule",
        "Assigned-Molecule-Location/Type",
        "GenBank-Accn",
        "Relationship",
        "RefSeq-Accn",
        "Assembly-Unit",
        "Sequence-Length",
        "UCSC-style-name",
    ]
    asm_report = pd.read_csv(assembly_report,
                             sep="\t",
                             comment="#",
                             names=header)
    return asm_report
Пример #13
0
    def _update_metadata(self, metadata):
        """
        Fill in metadata entries that are missing.

        The provider is updated first when it is absent or "na". The
        provider's genome entry is then looked up only if the provider is
        Ensembl, UCSC or NCBI, an "original name" is present, and at least
        one of "tax_id" / "assembly_accession" is missing. Finally each
        missing key is handed to its dedicated update method.

        Parameters
        ----------
        metadata : dict
            Metadata to update.
        """
        print("Updating metadata in README.txt", file=sys.stderr)
        if metadata.get("provider", "na") == "na":
            self._update_provider(metadata)

        provider_is_known = metadata["provider"] in ("Ensembl", "UCSC", "NCBI")
        original_name = safe(metadata.get("original name", ""))
        all_present = all(
            key in metadata for key in ("tax_id", "assembly_accession")
        )

        # Both stay None unless a provider lookup is possible and needed.
        provider_obj = None
        genome_entry = None
        if provider_is_known and original_name and not all_present:
            provider_obj = ProviderBase.create(metadata["provider"])
            genome_entry = provider_obj.genomes.get(original_name)

        if "tax_id" not in metadata:
            self._update_tax_id(metadata, provider_obj, genome_entry)
        if "assembly_accession" not in metadata:
            self._update_assembly_accession(metadata, provider_obj, genome_entry)
Пример #14
0
def install_genome(name,
                   provider,
                   genome_dir=None,
                   localname=None,
                   mask="soft",
                   regex=None,
                   invert_match=False,
                   bgzip=None,
                   annotation=False,
                   force=False,
                   **kwargs):
    """
    Install a genome.

    Existing files are reused: the genome is only downloaded when no fasta
    file is found in the output directory, the annotation only when no GTF
    file is found, and the gap file is only generated when it does not
    exist. ``force`` overrides all three checks.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the configured
        "genome_dir" when not given.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', choices 'hard'/'soft'/'none' for respective masking level.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    force : bool , optional
        Set to True to overwrite existing files.

    kwargs : dict, optional
        Provider specific options.
        Ensembl:

        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.

        version : int, optional
            Ensembl only: Specify release version. Default is latest.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome directory is given and none is configured.
    """
    if not genome_dir:
        genome_dir = config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError(
            "Please provide or configure a genome_dir")

    genome_dir = os.path.expanduser(genome_dir)
    localname = get_localname(name, localname)
    out_dir = os.path.join(genome_dir, localname)

    # The provider is created lazily, at most once, and shared between the
    # genome and annotation downloads instead of being instantiated twice.
    p = None

    # Check if genome already exists, or if downloading is forced
    no_genome_found = not any(
        os.path.exists(fname) for fname in glob_ext_files(out_dir, "fa"))
    if no_genome_found or force:
        # Download genome from provider
        p = ProviderBase.create(provider)
        p.download_genome(name,
                          genome_dir,
                          mask=mask,
                          regex=regex,
                          invert_match=invert_match,
                          localname=localname,
                          bgzip=bgzip,
                          **kwargs)

    # If annotation is requested, check if annotation already exists, or if downloading is forced
    no_annotation_found = not any(
        os.path.exists(fname) for fname in glob_ext_files(out_dir, "gtf"))
    if annotation and (no_annotation_found or force):
        # Download annotation from provider
        if p is None:
            p = ProviderBase.create(provider)
        p.download_annotation(name, genome_dir, localname=localname, **kwargs)

    # generates a Fasta object and the index file
    g = Genome(localname, genome_dir=genome_dir)

    # Run all active plugins
    for plugin in get_active_plugins():
        plugin.after_genome_download(g, force)

    # Generate gap file if not found or if generation is forced
    gap_file = os.path.join(out_dir, localname + ".gaps.bed")
    if not os.path.exists(gap_file) or force:
        generate_gap_bed(glob_ext_files(out_dir, "fa")[0], gap_file)

    generate_env()
Пример #15
0
def _providers(provider=None):
    """
    Resolve the provider objects to work with.

    Returns a one-element list holding the requested provider when a
    provider name is given, otherwise whatever ``_online_providers()``
    returns.
    """
    if provider:
        return [ProviderBase.create(provider)]
    return _online_providers()
Пример #16
0
def install_genome(name,
                   provider,
                   version=None,
                   genome_dir=None,
                   localname=None,
                   mask="soft",
                   regex=None,
                   invert_match=False,
                   annotation=False):
    """
    Download a genome, then create its chromosome sizes and gap BED file.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    version : str , optional
        Version (only for Ensembl)

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the configured
        "genome_dir" when not given.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', specify 'hard' for hard masking.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome directory is given and none is configured.
    """
    target_dir = genome_dir or config.get("genome_dir", None)
    if not target_dir:
        raise norns.exceptions.ConfigError(
            "Please provide or configure a genome_dir")
    target_dir = os.path.expanduser(target_dir)

    # Fetch the genome; the provider reports the name it was stored under.
    source = ProviderBase.create(provider)
    download_options = dict(version=version,
                            mask=mask,
                            localname=localname,
                            regex=regex,
                            invert_match=invert_match)
    installed_name = source.download_genome(name, target_dir,
                                            **download_options)

    # Optionally fetch the annotation from the same provider.
    if annotation:
        source.download_annotation(installed_name, target_dir,
                                   version=version)

    # Chromosome sizes first, then the gap BED derived from the fasta file.
    generate_sizes(installed_name, target_dir)

    genome_subdir = os.path.join(target_dir, installed_name)
    fasta_path = os.path.join(genome_subdir, "{}.fa".format(installed_name))
    gaps_path = os.path.join(genome_subdir,
                             "{}.gaps.bed".format(installed_name))
    generate_gap_bed(fasta_path, gaps_path)
Пример #17
0
def install_genome(
    name,
    provider,
    version=None,
    genome_dir=None,
    localname=None,
    mask="soft",
    regex=None,
    invert_match=False,
    annotation=False,
    bgzip=None,
):
    """
    Download a genome from a provider, run the active plugins on it and
    regenerate the environment.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    version : str , optional
        Version (only for Ensembl)

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the configured
        "genome_dir" when not given.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', specify 'hard' for hard masking.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome directory is given and none is configured.
    """
    target_dir = genome_dir or config.get("genome_dir", None)
    if not target_dir:
        raise norns.exceptions.ConfigError("Please provide or configure a genome_dir")
    target_dir = os.path.expanduser(target_dir)

    # The local name is resolved once and used for every later step.
    local_id = get_localname(name, localname)

    source = ProviderBase.create(provider)
    genome_options = dict(
        version=version,
        mask=mask,
        localname=local_id,
        regex=regex,
        invert_match=invert_match,
        bgzip=bgzip,
    )
    source.download_genome(name, target_dir, **genome_options)

    # Optionally fetch the annotation from the same provider.
    if annotation:
        source.download_annotation(
            name, target_dir, localname=local_id, version=version
        )

    installed = Genome(local_id, genome_dir=target_dir)
    for plugin in get_active_plugins():
        plugin.after_genome_download(installed)

    generate_env()