Example #1
    def after_genome_download(self, genome, threads=1, force=False):
        index_name = genome.plugin["star"]["index_name"]
        if not cmd_ok("STAR") or (os.path.exists(index_name) and not force):
            return

        index_dir = genome.plugin["star"]["index_dir"]
        rm_rf(index_dir)
        mkdir_p(index_dir)

        # gunzip genome if bgzipped and return up-to-date genome name
        with extracted_file(genome.filename) as fname:
            # index command
            cmd = (f"STAR --runMode genomeGenerate --runThreadN {threads} " +
                   f"--genomeFastaFiles {fname} --genomeDir {index_dir} " +
                   f"--outFileNamePrefix {index_dir}")

            # if an annotation is present, generate a splice-aware index
            gtf_file = genome.annotation_gtf_file
            if gtf_file:
                with extracted_file(gtf_file) as _gtf_file:
                    # update index command with annotation
                    cmd += f" --sjdbGTFfile {_gtf_file}"

                    # Create index
                    run_index_cmd("star", cmd)
            else:
                logger.info("Creating STAR index without annotation file.")
                # Create index
                run_index_cmd("star", cmd)
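
Example #1 wraps the genome in extracted_file, a context manager that transparently decompresses a (b)gzipped file for the duration of the block and restores it afterwards. A minimal sketch of such a helper, assuming the gunzip_and_name/bgzip_and_name pair seen in the later examples (the library's own helper may differ, e.g. in how it chooses gzip vs. bgzip when re-compressing):

    from contextlib import contextmanager

    @contextmanager
    def extracted_file(fname):
        # sketch only: gunzip_and_name is assumed to return
        # (plain_name, was_compressed), as in Examples #2, #4 and #9
        plain_name, was_compressed = gunzip_and_name(fname)
        try:
            yield plain_name
        finally:
            # restore the original compression state
            bgzip_and_name(plain_name, was_compressed)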
Example #2
File: gmap.py Project: masastat/genomepy
    def after_genome_download(self, genome, threads=1, force=False):
        if not cmd_ok("gmap_build"):
            return

        # Create index dir
        index_dir = genome.plugin["gmap"]["index_dir"]
        if force:
            # Start from scratch
            rm_rf(index_dir)

        if not os.path.exists(index_dir):
            # unzip genome if zipped and return up-to-date genome name
            fname, bgzip = gunzip_and_name(genome.filename)

            # gmap outputs a folder named genome.name
            # its content is moved to index dir, consistent with other plugins
            tmp_dir = mkdtemp(dir=".")
            # Create index
            cmd = f"gmap_build -D {tmp_dir} -d {genome.name} {fname}"
            run_index_cmd("gmap", cmd)

            # Move files to index_dir
            src = os.path.join(tmp_dir, genome.name)
            move(src, index_dir)
            rm_rf(tmp_dir)

            # re-zip genome if unzipped
            bgzip_and_name(fname, bgzip)
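
Both plugins above delegate the actual shell call to run_index_cmd. A hedged sketch of what such a helper might do, i.e. run the indexer and report a failure without aborting the whole download (genomepy's own implementation may differ):

    import logging
    import subprocess as sp

    logger = logging.getLogger(__name__)  # stand-in for the logger seen in Example #1

    def run_index_cmd(name, cmd):
        # sketch only: run the indexing command and log any error output
        logger.info("Creating %s index...", name)
        ret = sp.run(cmd, shell=True, capture_output=True, text=True)
        if ret.returncode != 0:
            logger.error("Indexing with %s failed:\n%s", name, ret.stderr)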
Example #3
def download_annotation(name, annot, genomes_dir, localname, n=None):
    """
    Download the extended genePred file from the UCSC MySQL database.
    Next convert this to a BED and GTF file.
    """
    out_dir = os.path.join(genomes_dir, localname)
    mkdir_p(out_dir)
    tmp_dir = mkdtemp(dir=out_dir)
    pred_file = f"{os.path.join(tmp_dir, localname)}.annotation.extended.gp"
    gtf_file = f"{os.path.join(out_dir, localname)}.annotation.gtf"
    bed_file = f"{os.path.join(out_dir, localname)}.annotation.bed"

    # MySQL query 1: get column names for this genePred
    command = f"SHOW COLUMNS FROM {annot};"
    cols = list(query_ucsc(command, database=name))

    # drop columns the UCSC tools cannot handle
    # see https://genome.ucsc.edu/FAQ/FAQformat.html#format9
    accepted_cols = [
        "geneName",
        "name",
        "chrom",
        "strand",
        "txStart",
        "txEnd",
        "cdsStart",
        "cdsEnd",
        "exonCount",
        "exonStarts",
        "exonEnds",
        "score",
        "name2",
        "cdsStartStat",
        "cdsEndStat",
        "exonFrames",
    ]
    cols = [c[0] for c in cols if c[0] in accepted_cols]
    cols = ",".join(cols)

    # MySQL query 2: download genePred
    command = f"SELECT {cols} FROM {annot};"
    if n:
        command = f"SELECT {cols} FROM {annot} LIMIT {n};"
    ret = query_ucsc(command, database=name)

    # clean up genePred
    df = pd.DataFrame.from_records(ret)
    for c in [8, 9, 14]:
        if c in df:
            df[c] = df[c].str.decode("utf-8")
    df.to_csv(pred_file, index=False, header=False, sep="\t")

    # convert genePred to GTF and BED
    cmd = "genePredToGtf -source=genomepy file {0} {1}"
    sp.check_call(cmd.format(pred_file, gtf_file), shell=True)
    cmd = "genePredToBed {0} {1}"
    sp.check_call(cmd.format(pred_file, bed_file), shell=True)
    rm_rf(tmp_dir)
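
query_ucsc is assumed to stream rows from UCSC's public MySQL mirror (genome-mysql.soe.ucsc.edu, read-only user "genome"). A minimal sketch using PyMySQL, not necessarily genomepy's own implementation:

    import pymysql

    def query_ucsc(command, database):
        # sketch only: run a query against the public UCSC MySQL server
        conn = pymysql.connect(
            host="genome-mysql.soe.ucsc.edu",
            user="genome",
            port=3306,
            database=database,
        )
        try:
            with conn.cursor() as cursor:
                cursor.execute(command)
                yield from cursor.fetchall()
        finally:
            conn.close()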
Example #4
    def after_genome_download(self, genome, threads=1, force=False):
        index_name = genome.plugin["hisat2"]["index_name"]
        if not cmd_ok("hisat2-build") or (
            os.path.exists(f"{index_name}.1.ht2") and not force
        ):
            return

        index_dir = genome.plugin["hisat2"]["index_dir"]
        rm_rf(index_dir)
        mkdir_p(index_dir)

        # gunzip genome if bgzipped and return up-to-date genome name
        fname, bgzip = gunzip_and_name(genome.filename)

        # index command
        cmd = f"hisat2-build -p {threads} {fname} {index_name}"

        # if an annotation is present, generate a splice-aware index
        gtf_file = genome.annotation_gtf_file
        if gtf_file:
            # gunzip if gzipped
            gtf_file, gzip_file = gunzip_and_name(gtf_file)

            # generate splice and exon site files to enhance indexing
            hisat_path = (
                sp.Popen("which hisat2", stdout=sp.PIPE, shell=True)
                .stdout.read()
                .decode("utf8")
                .strip()
            )
            splice_script = hisat_path + "_extract_splice_sites.py"
            splice_file = os.path.join(genome.genome_dir, "splice_sites.txt")
            sp.check_call(
                f"python3 {splice_script} {gtf_file} > {splice_file}", shell=True
            )

            exon_script = hisat_path + "_extract_exons.py"
            exon_file = os.path.join(genome.genome_dir, "exon_sites.txt")
            sp.check_call(f"python3 {exon_script} {gtf_file} > {exon_file}", shell=True)

            # re-gzip annotation if gunzipped
            gzip_and_name(gtf_file, gzip_file)

            # update index command with annotation
            cmd += f" --ss {splice_file} --exon {exon_file}"
        else:
            print("\nCreating Hisat2 index without annotation file.")

        # Create index
        run_index_cmd("hisat2", cmd)

        # re-bgzip genome if gunzipped
        bgzip_and_name(fname, bgzip)
Example #5
    def after_genome_download(self, genome, threads=1, force=False):
        if not cmd_ok("minimap2"):
            return

        # Create index dir
        index_dir = genome.plugin["minimap2"]["index_dir"]
        index_name = genome.plugin["minimap2"]["index_name"]
        if force:
            # Start from scratch
            rm_rf(index_dir)
        mkdir_p(index_dir)

        if not any(fname.endswith(".mmi") for fname in os.listdir(index_dir)):
            # Create index
            cmd = f"minimap2 -t {threads} -d {index_name} {genome.filename}"
            run_index_cmd("minimap2", cmd)
Example #6
    def after_genome_download(self, genome, threads=1, force=False):
        if not cmd_ok("bwa"):
            return

        # Create index dir
        index_dir = genome.plugin["bwa"]["index_dir"]
        index_name = genome.plugin["bwa"]["index_name"]
        if force:
            # Start from scratch
            rm_rf(index_dir)
        mkdir_p(index_dir)

        if not any(fname.endswith(".bwt") for fname in os.listdir(index_dir)):
            # Create index
            if not os.path.exists(index_name):
                os.symlink(genome.filename, index_name)
            cmd = f"bwa index {index_name}"
            run_index_cmd("bwa", cmd)
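
Examples #1, #2, #5 and #6 all follow the same plugin pattern: bail out if the indexer binary is missing, wipe the index directory on force, and only build when no index is present. A hypothetical plugin for a new aligner could reuse that skeleton (every "myaligner" name below is illustrative, and Plugin is assumed to be the base class these examples subclass):

    import os

    class MyAlignerPlugin(Plugin):  # hypothetical, not part of genomepy
        def after_genome_download(self, genome, threads=1, force=False):
            if not cmd_ok("myaligner-build"):  # hypothetical indexer binary
                return

            index_dir = genome.plugin["myaligner"]["index_dir"]
            index_name = genome.plugin["myaligner"]["index_name"]
            if force:
                # Start from scratch
                rm_rf(index_dir)
            mkdir_p(index_dir)

            if not any(f.endswith(".idx") for f in os.listdir(index_dir)):
                # Create index
                cmd = f"myaligner-build -t {threads} {genome.filename} {index_name}"
                run_index_cmd("myaligner", cmd)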
Example #7
def extract_tarball(fname, outfile=None, concat=True) -> Union[str, None]:
    """Convert tar of multiple FASTAs to one file."""
    fnames = []
    # Extract files to temporary directory
    tmp_dir = mkdtemp(dir=os.path.dirname(outfile))
    with tarfile.open(fname) as tar:
        tar.extractall(path=tmp_dir)
    for root, _, files in os.walk(tmp_dir):
        fnames += [os.path.join(root, fname) for fname in files]

    if len(fnames) > 1 and not concat:
        raise ValueError("tarball contains multiple files, but concat not specified!")

    # Concatenate (also works with one file)
    with open(outfile, "w") as out:
        for infile in fnames:
            for line in open(infile):
                out.write(line)

    rm_rf(tmp_dir)

    return outfile
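
For reference, flattening a downloaded per-chromosome tarball into a single FASTA might then look like this (paths are hypothetical):

    combined = extract_tarball("chromosomes.tar.gz", outfile="./genome.fa", concat=True)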
Example #8
def generate_annot(template, target, overwrite=False):
    """
    Create one annotation file type (GTF or BED) from the other.

    Parameters
    ----------
    template: str
        a GTF or BED filepath.
    target: str
        filepath to save the new annotation to.
    overwrite: bool, optional
        overwrite existing target file?
    """
    exts = os.path.basename(template.lower()).split(".")
    exts = [e for e in exts if e in ["gtf", "bed"]]
    if len(exts) == 0:
        raise ValueError("Template file must be in GTF or BED format.")
    template_ext = exts[-1]

    if not overwrite and os.path.exists(target):
        raise FileExistsError(f"{target} already exists! Set overwrite=True to ignore.")

    target_dir = os.path.dirname(target)
    tmp_dir = mkdtemp(dir=target_dir)
    tmp_target = os.path.join(tmp_dir, "new_annot")

    if template_ext == "bed":
        cmd = "bedToGenePred {0} /dev/stdout | genePredToGtf -source=genomepy file /dev/stdin {1}"
    else:
        cmd = "gtfToGenePred -genePredExt -ignoreGroupsWithoutExons {0} /dev/stdout | genePredToBed /dev/stdin {1}"

    # unzip template if needed
    with extracted_file(template) as _template:
        sp.check_call(cmd.format(_template, tmp_target), shell=True)
        # gzip if needed
        tmp_target = gzip_and_name(tmp_target, target.endswith(".gz"))

    shutil.move(tmp_target, target)
    rm_rf(tmp_dir)
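
Given an existing annotation, the missing counterpart can then be generated in either direction; the file names below are hypothetical and the UCSC conversion tools must be on the PATH:

    generate_annot("hg38.annotation.gtf", "hg38.annotation.bed", overwrite=True)  # GTF -> BED
    generate_annot("hg38.annotation.bed.gz", "hg38.annotation.gtf.gz")            # BED -> gzipped GTF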
Example #9
    def after_genome_download(self, genome, threads=1, force=False):
        index_name = genome.plugin["star"]["index_name"]
        if not cmd_ok("STAR") or (os.path.exists(index_name) and not force):
            return

        index_dir = genome.plugin["star"]["index_dir"]
        rm_rf(index_dir)
        mkdir_p(index_dir)

        # gunzip genome if bgzipped and return up-to-date genome name
        fname, bgzip = gunzip_and_name(genome.filename)

        # index command
        cmd = (f"STAR --runMode genomeGenerate --runThreadN {threads} " +
               f"--genomeFastaFiles {fname} --genomeDir {index_dir} " +
               f"--outFileNamePrefix {index_dir}")

        # if an annotation is present, generate a splice-aware index
        gtf_file = genome.annotation_gtf_file
        gzip_file = False
        if gtf_file:
            # gunzip if gzipped
            gtf_file, gzip_file = gunzip_and_name(gtf_file)

            # update index command with annotation
            cmd += f" --sjdbGTFfile {gtf_file}"
        else:
            print("\nCreating STAR index without annotation file.")

        # Create index
        run_index_cmd("star", cmd)

        # re-bgzip genome if gunzipped
        bgzip_and_name(fname, bgzip)

        # re-gzip annotation if gunzipped
        if gtf_file:
            gzip_and_name(gtf_file, gzip_file)
Example #10
def head_annotations(name: str, provider=None, n: int = 2):
    """
    Quickly inspect the metadata of each available annotation for the specified genome.

    For UCSC, up to 4 gene annotation styles are available:
    "ncbiRefSeq", "refGene", "ensGene", "knownGene" (in that order).

    For NCBI, the chromosome names are not yet sanitized.

    Parameters
    ----------
    name: str
        genome name
    provider: str, optional
        only search the specified provider for the genome name
    n: int, optional
        number of lines to show
    """
    for p in online_providers(provider):
        if name in p.genomes:
            tmp_dir = mkdtemp()
            p.head_annotation(name, genomes_dir=tmp_dir, n=n)
            rm_rf(tmp_dir)
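
A quick way to compare what providers offer before committing to a full download (genome name and provider below are illustrative):

    head_annotations("hg38", provider="UCSC", n=2)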
Example #11
def _apply_fasta_regex_func(infa, regex_func, outfa=None):
    """
    Filter a FASTA file using the regex function.

    Parameters
    ----------
    infa: str
        path to genome fasta
    regex_func: callable
        a function that takes a contig header and returns a bool
    outfa: str, optional
        path to output fasta. If None, infa is overwritten

    Returns
    -------
    list
        the excluded contigs
    """
    # move the original file to a tmp folder
    out_dir = os.path.dirname(infa)
    tmp_dir = mkdtemp(dir=out_dir)
    old_fname = os.path.join(tmp_dir, "original") if outfa is None else infa
    new_fname = os.path.join(tmp_dir, "filtered")
    shutil.move(infa, old_fname)

    # perform the filtering
    excluded_contigs = []
    keep_contig = True
    with open(old_fname) as old, open(new_fname, "w") as new:
        for line in tqdm(old, desc="Filtering Fasta", unit_scale=1, unit=" lines"):
            if line[0] == ">":
                keep_contig = regex_func(line)
                if keep_contig is False:
                    excluded_contigs.append(line[1:].split(" ")[0].strip())
            if keep_contig:
                new.write(line)

    # move the filtered file to the original folder
    shutil.move(new_fname, outfa if outfa else infa)
    rm_rf(tmp_dir)

    return excluded_contigs
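
A regex_func is simply a callable mapping a FASTA header line to a bool (True means keep the contig). For instance, a filter that drops alternative-haplotype contigs, in the spirit of the "alt" filtering in Example #16, could be built like this (a sketch, not a library helper; the path is hypothetical):

    import re

    alt_pattern = re.compile("alt", re.IGNORECASE)

    def drop_alt_contigs(header):
        # keep contigs whose header does NOT mention "alt"
        return not alt_pattern.search(header)

    excluded = _apply_fasta_regex_func("./genome.fa", drop_alt_contigs)
    print(f"removed {len(excluded)} alt contigs")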
Example #12
def download_annotation(genomes_dir, annot_url, localname, n=None):
    """download annotation file, convert to intermediate file and generate output files"""

    # create output directory if missing
    out_dir = os.path.join(genomes_dir, localname)
    mkdir_p(out_dir)

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    tmp_dir = mkdtemp(dir=out_dir)
    ext, is_compressed = get_file_info(annot_url)

    annot_file = os.path.join(tmp_dir, localname + ".annotation" + ext)
    tmp_annot_file = os.path.join(tmp_dir, annot_url.split("/")[-1])
    get_file = shutil.copyfile if os.path.exists(annot_url) else download_file
    if n is None:
        get_file(annot_url, tmp_annot_file)
    else:
        download_head(annot_url, tmp_annot_file, n)
        is_compressed = False

    # unzip input file (if needed)
    if is_compressed:
        annot_file = extract_archive(tmp_annot_file, outfile=annot_file)
    else:
        shutil.move(tmp_annot_file, annot_file)

    # generate intermediate file (GenePred)
    pred_file = annot_file.replace(ext, ".gp")
    if "bed" in ext:
        cmd = "bedToGenePred {0} {1}"
    elif "gff" in ext:
        # example annotation: GRCh38.p12 from NCBI
        cmd = "gff3ToGenePred -useName -warnAndContinue {0} {1}"
    elif "gtf" in ext:
        cmd = "gtfToGenePred -genePredExt -allErrors -ignoreGroupsWithoutExons {0} {1}"
    elif "txt" in ext:
        # UCSC annotations only
        with open(annot_file) as f:
            cols = f.readline().split("\t")

        # extract the genePred format columns
        start_col = 1
        for i, col in enumerate(cols):
            if col in ["+", "-"]:
                start_col = i - 1
                break
        end_col = start_col + 10
        cmd = (
            f"""cat {{0}} | cut -f {start_col}-{end_col} | """
            # knownGene.txt.gz has spotty fields, this replaces non-integer fields with zeroes
            +
            """awk 'BEGIN {{FS=OFS="\t"}} !($11 ~ /^[0-9]+$/) {{$11="0"}}1' > {1}"""
        )
    else:
        raise TypeError(f"file type extension {ext} not recognized!")

    if n is None and "gencode" in annot_url:
        rename_contigs(annot_file)

    sp.check_call(cmd.format(annot_file, pred_file), shell=True)

    # generate gzipped gtf file (if required)
    gtf_file = annot_file.replace(ext, ".gtf")
    if "gtf" not in ext:
        cmd = "genePredToGtf -source=genomepy file {0} {1}"
        sp.check_call(cmd.format(pred_file, gtf_file), shell=True)

    # generate gzipped bed file (if required)
    bed_file = annot_file.replace(ext, ".bed")
    if "bed" not in ext:
        cmd = "genePredToBed {0} {1}"
        sp.check_call(cmd.format(pred_file, bed_file), shell=True)

    # transfer the files from the tmpdir to the genome_dir
    for f in [gtf_file, bed_file]:
        src = f
        dst = os.path.join(out_dir, os.path.basename(f))
        shutil.move(src, dst)
    rm_rf(tmp_dir)
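
get_file_info is assumed to report the annotation's file extension and whether it is gzipped, which drives both the decompression step and the format dispatch above. A minimal sketch (the real helper may recognise more cases):

    import os

    def get_file_info(fname):
        # sketch only: return (extension, is_gzipped) for an annotation URL or path
        fname = fname.lower()
        is_compressed = fname.endswith(".gz")
        if is_compressed:
            fname = fname[: -len(".gz")]
        ext = os.path.splitext(fname)[1]  # e.g. ".gtf", ".gff3", ".bed", ".txt"
        return ext, is_compressed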
Example #13
def clean():
    """Remove cached data on providers"""
    my_cache_dir = os.path.join(user_cache_dir("genomepy"), __version__)
    rm_rf(my_cache_dir)
    mkdir_p(my_cache_dir)
    print("All clean!")
Example #14
def _delete_extensions(directory: str, exts: list):
    """remove (gzipped) files in a directory matching any given extension"""
    for ext in exts:
        [rm_rf(f) for f in glob_ext_files(directory, ext)]
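
glob_ext_files is assumed to match both plain and gzipped files for a given extension, which is what makes the "(gzipped)" part of the docstring work. A minimal sketch consistent with that reading:

    import os
    from glob import glob

    def glob_ext_files(directory, ext="fa"):
        # sketch only: files in directory ending in .<ext> or .<ext>.gz
        paths = glob(os.path.join(directory, "*"))
        return [p for p in paths if p.endswith((f".{ext}", f".{ext}.gz"))]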
Example #15
    def download_and_generate_annotation(genomes_dir, annot_url, localname):
        """download annotation file, convert to intermediate file and generate output files"""

        # create output directory if missing
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        tmp_dir = mkdtemp(dir=out_dir)
        ext, gz = get_file_info(annot_url)
        annot_file = os.path.join(tmp_dir, localname + ".annotation" + ext)
        download_file(annot_url, annot_file)

        # unzip input file (if needed)
        if gz:
            cmd = "mv {0} {1} && gunzip -f {1}"
            sp.check_call(cmd.format(annot_file, annot_file + ".gz"),
                          shell=True)

        # generate intermediate file (GenePred)
        pred_file = annot_file.replace(ext, ".gp")
        if "bed" in ext:
            cmd = "bedToGenePred {0} {1}"
        elif "gff" in ext:
            cmd = "gff3ToGenePred -geneNameAttr=gene {0} {1}"
        elif "gtf" in ext:
            cmd = "gtfToGenePred -ignoreGroupsWithoutExons {0} {1}"
        elif "txt" in ext:
            # UCSC annotations only
            with open(annot_file) as f:
                cols = f.readline().split("\t")

            # extract the genePred format columns
            start_col = 1
            for i, col in enumerate(cols):
                if col in ["+", "-"]:
                    start_col = i - 1
                    break
            end_col = start_col + 10
            cmd = (
                f"""cat {{0}} | cut -f {start_col}-{end_col} | """
                # knownGene.txt.gz has spotty fields, this replaces non-integer fields with zeroes
                +
                """awk 'BEGIN {{FS=OFS="\t"}} !($11 ~ /^[0-9]+$/) {{$11="0"}}1' > {1}"""
            )
        else:
            raise TypeError(f"file type extension {ext} not recognized!")

        sp.check_call(cmd.format(annot_file, pred_file), shell=True)

        # generate gzipped gtf file (if required)
        gtf_file = annot_file.replace(ext, ".gtf")
        if "gtf" not in ext:
            cmd = "genePredToGtf -source=genomepy file {0} {1} && gzip -f {1}"
            sp.check_call(cmd.format(pred_file, gtf_file), shell=True)

        # generate gzipped bed file (if required)
        bed_file = annot_file.replace(ext, ".bed")
        if "bed" not in ext:
            cmd = "genePredToBed {0} {1} && gzip -f {1}"
            sp.check_call(cmd.format(pred_file, bed_file), shell=True)

        # if input file was gtf/bed, gzip it
        if ext in [".gtf", ".bed"]:
            cmd = "gzip -f {}"
            sp.check_call(cmd.format(annot_file), shell=True)

        # transfer the files from the tmpdir to the genome_dir
        for f in [gtf_file + ".gz", bed_file + ".gz"]:
            src = f
            dst = os.path.join(out_dir, os.path.basename(f))
            shutil.move(src, dst)
        rm_rf(tmp_dir)
Example #16
    def download_genome(
        self,
        name,
        genomes_dir=None,
        localname=None,
        mask="soft",
        keep_alt=False,
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs,
    ):
        """
        Download a (gzipped) genome file to a specific directory

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask : str , optional
            Masking level: "soft", "hard" or "none" (any other string is treated as none)

        keep_alt : bool , optional
            Set to True to keep alternative (ALT) regions; they are removed by default.

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            If not specified, the setting from the configuration file will be used.
        """
        name = safe(name)
        self.check_name(name)

        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        sys.stderr.write(
            f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        tmp_dir = mkdtemp(dir=out_dir)
        fname = os.path.join(tmp_dir, f"{localname}.fa")

        urlcleanup()
        download_file(link, fname)
        sys.stderr.write(
            "Genome download successful, starting post processing...\n")

        # unzip genome
        if link.endswith(".tar.gz"):
            tar_to_bigfile(fname, fname)
        elif link.endswith(".gz"):
            os.rename(fname, fname + ".gz")
            ret = sp.check_call(["gunzip", "-f", fname])
            if ret != 0:
                raise Exception(f"Error gunzipping genome {fname}")

        def regex_filer(_fname, _regex, _v):
            infa = _fname + "_to_regex"
            os.rename(_fname, infa)
            # filter the fasta and store the output's keys
            keys_out = filter_fasta(infa,
                                    outfa=_fname,
                                    regex=_regex,
                                    v=_v,
                                    force=True).keys()
            keys_in = Fasta(infa).keys()
            return [k for k in keys_in if k not in keys_out]

        not_included = []
        # remove alternative regions
        if not keep_alt:
            not_included.extend(regex_filer(fname, "alt", True))

        # keep/remove user defined regions
        if regex:
            not_included.extend(regex_filer(fname, regex, invert_match))

        # process genome (e.g. masking)
        if hasattr(self, "_post_process_download"):
            self._post_process_download(name=name,
                                        localname=localname,
                                        out_dir=tmp_dir,
                                        mask=mask)

        # bgzip genome if requested
        if bgzip or config.get("bgzip"):
            # bgzip to stdout, track progress, and output to file
            fsize = int(os.path.getsize(fname) * 10**-6)
            cmd = (
                f"bgzip -fc {fname} | "
                f"tqdm --bytes --desc Bgzipping {fsize}MB fasta --log ERROR | "
                f"cat > {fname}.gz")
            ret = sp.check_call(cmd, shell=True)
            if ret != 0:
                raise Exception(f"Error bgzipping {name}. Is tabix installed?")
            fname += ".gz"

        # transfer the genome from the tmpdir to the genome_dir
        src = fname
        dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
        shutil.move(src, dst)
        rm_rf(tmp_dir)

        sys.stderr.write("\n")
        sys.stderr.write("name: {}\n".format(name))
        sys.stderr.write("local name: {}\n".format(localname))
        sys.stderr.write("fasta: {}\n".format(dst))

        # Create readme with information
        readme = os.path.join(genomes_dir, localname, "README.txt")
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            "assembly_accession":
            self.assembly_accession(self.genomes.get(name)),
            "tax_id": self.genome_taxid(self.genomes.get(name)),
            "mask": mask,
            "genome url": link,
            "annotation url": "na",
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        lines = []
        if not keep_alt or regex:
            regex_line = "regex: "
            if not keep_alt:
                regex_line += "'alt' (inverted match)"

            if not keep_alt and regex:
                regex_line += " and "

            if regex:
                regex_line += f"'{regex}'"
                if invert_match:
                    regex_line += " (inverted match)"

            lines += ["", regex_line, "sequences that were excluded:"]
            for seq in not_included:
                lines.append(f"\t{seq}")
        write_readme(readme, metadata, lines)
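
The keep_alt, regex and invert_match options ultimately reduce to a contig-name selection on the FASTA headers. A small illustration of the intended semantics (plain re here, not genomepy's filter_fasta; contig names are examples):

    import re

    contigs = ["chr1", "chr2_KI270776v1_alt", "chrX", "chrUn_KI270302v1"]
    regex, invert_match = r"^chr[0-9XY]+$", False
    keep = [c for c in contigs if bool(re.search(regex, c)) != invert_match]
    # keep == ["chr1", "chrX"]; with invert_match=True the selection is reversed.
    # The default keep_alt=False behaves like regex="alt" with invert_match=True.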