Пример #1
0
def download_gencode_dir(output_dir):
    """Download all the required gencode files
    """
    makedir_exist_ok(output_dir)
    url_template = ("https://github.com/kipoi/models/blob/"
                    "7648d3fd57def50934835b52acadd26bcaaa275c/FactorNet/"
                    "template/dataloader_files/gencode_features/{}?raw=true")

    # url_template = "https://github.com/uci-cbcl/FactorNet/blob/master/resources/{}?raw=true"
    fnames = [('cpgisland.bed.gz', 'ac7dc007d7019c05adb7a331d1d6721d'),
              ('wgEncodeGencodeBasicV19.cds.merged.bed.gz',
               '4ec9883932932efe87e4adc6c84ced1c'),
              ('wgEncodeGencodeBasicV19.intron.merged.bed.gz',
               'd2db7e3255323d2b5b04e1c0c59ecd2d'),
              ('wgEncodeGencodeBasicV19.promoter.merged.bed.gz',
               '48fe1ab3aa0e9f5d11f3e5dfedbd47b6'),
              ('wgEncodeGencodeBasicV19.utr5.merged.bed.gz',
               'de87c14d4ff055226afeb01446aba6e6'),
              ('wgEncodeGencodeBasicV19.utr3.merged.bed.gz',
               '8bbe08f5fba86306dfbef56d756856f1')]
    for fname, md5 in fnames:
        output_file = os.path.join(output_dir, fname)
        rf = RemoteFile(url=url_template.format(fname), md5=md5)
        if not os.path.exists(output_file) or not rf.validate(output_file):
            rf.get_file(output_file)
Пример #2
0
def create_conda_run():
    """Create conda_run bash script to ~/.kipoi/bin/conda_run

    NOTE: this should be changed to `conda run` once conda=4.6.0 is released
    https://github.com/conda/conda/issues/2379
    """
    from kipoi.config import _kipoi_dir
    crun = """#!/bin/bash
# Run a bash command in a new conda environment
set -e # stop on error

if [[ $# -lt 2 ]] ; then
    echo "Usage: "
    echo "       conda_run <conda envrionment> <command> "
    exit 0
fi

env=$1
cmd=${@:2}
echo "Running command in env: $env"
echo "Command: $cmd"

source activate $env
$cmd
source deactivate $env
"""
    bin_dir = os.path.join(_kipoi_dir, 'bin')
    makedir_exist_ok(bin_dir)
    crun_path = os.path.join(bin_dir, 'conda_run')
    with open(crun_path, 'w') as f:
        f.write(crun)

    # make it executable
    subprocess.call(["chmod", "u+x", crun_path])
    return crun_path
Пример #3
0
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 cell_line=None,
                 RNAseq_PC_file=None,
                 mappability_file=None,
                 use_linecache=True):

        # intervals
        if use_linecache:
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # initialize later
        # DNase
        self.dnase_file = dnase_file
        self.dnase_extractor = None
        # mappability
        if mappability_file is None:
            # download the mappability file if not existing
            common_dl_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files")
            makedir_exist_ok(common_dl_dir)
            rf = RemoteFile(url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                            md5="1d15ddafe2c8df51cf08495db96679e7")
            mappability_file = os.path.join(common_dl_dir, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
            if not os.path.exists(mappability_file) or not rf.validate(mappability_file):
                # download the path
                rf.get_file(mappability_file)
        self.mappability_file = mappability_file
        self.mappability_extractor = None
        # Get the metadata features
        if cell_line is None:
            if RNAseq_PC_file is None:
                raise ValueError("RNAseq_PC_file has to be specified when cell_line=None")
            assert os.path.exists(RNAseq_PC_file)
        else:
            # Using the pre-defined cell-line
            output_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files/RNAseq_features/")
            makedir_exist_ok(output_dir)
            RNAseq_PC_file = os.path.join(output_dir, cell_line, "meta.txt")
            url_template = ('https://s3.eu-central-1.amazonaws.com/kipoi-models/dataloader_files/'
                            'FactorNet/dataloader_files/RNAseq_features/{}/meta.txt')
            # rf = RemoteFile(url=url_template.format(cell_line))
            if not os.path.exists(RNAseq_PC_file):  # or not rf.validate(mappability_file):
                # download the path
                download_url(url_template.format(cell_line), os.path.join(output_dir, cell_line), "meta.txt")
                # rf.get_file(RNAseq_PC_file)

        self.meta_feat = pd.read_csv(RNAseq_PC_file,
                                     sep="\t", header=None)[0].values
Пример #4
0
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 mappability_file=None,
                 use_linecache=True):

        # intervals
        if use_linecache:
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # initialize later
        # DNase
        self.dnase_file = dnase_file
        self.dnase_extractor = None
        # mappability
        if mappability_file is None:
            # download the mappability file if not existing
            common_dl_dir = os.path.join(
                this_dir, "../../template/downloaded/dataloader_files")
            makedir_exist_ok(common_dl_dir)
            rf = RemoteFile(
                url=
                "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                md5="1d15ddafe2c8df51cf08495db96679e7")
            mappability_file = os.path.join(
                common_dl_dir, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
            if not os.path.exists(mappability_file) or not rf.validate(
                    mappability_file):
                # download the path
                rf.get_file(mappability_file)
        self.mappability_file = mappability_file
        self.mappability_extractor = None
Пример #5
0
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 cell_line=None,
                 RNAseq_PC_file=None,
                 mappability_file=None,
                 GENCODE_dir=None,
                 use_linecache=True):

        # intervals
        if use_linecache:
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # initialize later
        # DNase
        self.dnase_file = dnase_file
        self.dnase_extractor = None
        # mappability
        if mappability_file is None:
            # download the mappability file if not existing
            common_dl_dir = os.path.join(
                this_dir, "../../template/downloaded/dataloader_files")
            makedir_exist_ok(common_dl_dir)
            rf = RemoteFile(
                url=
                "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                md5="1d15ddafe2c8df51cf08495db96679e7")
            mappability_file = os.path.join(
                common_dl_dir, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
            if not os.path.exists(mappability_file) or not rf.validate(
                    mappability_file):
                # download the path
                rf.get_file(mappability_file)
        self.mappability_file = mappability_file
        self.mappability_extractor = None
        # Gencode features
        if GENCODE_dir is None:
            gp = os.path.join(
                this_dir,
                "../../template/downloaded/dataloader_files/gencode_features/")
        else:
            gp = GENCODE_dir

        download_gencode_dir(gp)  # download files
        self.gencode_beds = [
            ("cpg", BedTool(gp + '/cpgisland.bed.gz')),
            ("cds",
             BedTool(gp + '/wgEncodeGencodeBasicV19.cds.merged.bed.gz')),
            ("intron",
             BedTool(gp + '/wgEncodeGencodeBasicV19.intron.merged.bed.gz')),
            ("promoter",
             BedTool(gp + '/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')),
            ("utr5",
             BedTool(gp + '/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')),
            ("utr3",
             BedTool(gp + '/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')),
        ]
        # Overlap beds - could be done incrementally
        print("Overlapping all the bed-files")
        # The BT() and .fn are there in order to leverage BedToolLinecache
        self.overlap_beds = [(b, BT(self.bt.intersect(v, wa=True, c=True).fn))
                             for b, v in self.gencode_beds]
        print("Assesing the file")
        assert len(self.overlap_beds[1][1]) == len(self.bt)
        # Get the metadata features
        if cell_line is None:
            if RNAseq_PC_file is None:
                raise ValueError(
                    "RNAseq_PC_file has to be specified when cell_line=None")
            assert os.path.exists(RNAseq_PC_file)
        else:
            # Using the pre-defined cell-line
            output_dir = os.path.join(
                this_dir,
                "../../template/downloaded/dataloader_files/RNAseq_features/")
            makedir_exist_ok(output_dir)
            RNAseq_PC_file = os.path.join(output_dir, cell_line, "meta.txt")
            url_template = (
                'https://s3.eu-central-1.amazonaws.com/kipoi-models/dataloader_files/'
                'FactorNet/dataloader_files/RNAseq_features/{}/meta.txt')
            # rf = RemoteFile(url=url_template.format(cell_line))
            if not os.path.exists(
                    RNAseq_PC_file):  # or not rf.validate(mappability_file):
                # download the path
                download_url(url_template.format(cell_line),
                             os.path.join(output_dir, cell_line), "meta.txt")
                # rf.get_file(RNAseq_PC_file)

        self.meta_feat = pd.read_csv(RNAseq_PC_file, sep="\t",
                                     header=None)[0].values
Пример #6
0
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 cell_line=None,
                 mappability_file=None,
                 GENCODE_dir=None,
                 use_linecache=True):

        # intervals
        if use_linecache:
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # initialize later
        # DNase
        self.dnase_file = dnase_file
        self.dnase_extractor = None
        # mappability
        if mappability_file is None:
            # download the mappability file if not existing
            common_dl_dir = os.path.join(
                this_dir, "../../template/downloaded/dataloader_files")
            makedir_exist_ok(common_dl_dir)
            rf = RemoteFile(
                url=
                "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                md5="1d15ddafe2c8df51cf08495db96679e7")
            mappability_file = os.path.join(
                common_dl_dir, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
            if not os.path.exists(mappability_file) or not rf.validate(
                    mappability_file):
                # download the path
                rf.get_file(mappability_file)
        self.mappability_file = mappability_file
        self.mappability_extractor = None
        # Gencode features
        if GENCODE_dir is None:
            gp = os.path.join(
                this_dir,
                "../../template/downloaded/dataloader_files/gencode_features/")
        else:
            gp = GENCODE_dir

        download_gencode_dir(gp)  # download files
        self.gencode_beds = [
            ("cpg", BedTool(gp + '/cpgisland.bed.gz')),
            ("cds",
             BedTool(gp + '/wgEncodeGencodeBasicV19.cds.merged.bed.gz')),
            ("intron",
             BedTool(gp + '/wgEncodeGencodeBasicV19.intron.merged.bed.gz')),
            ("promoter",
             BedTool(gp + '/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')),
            ("utr5",
             BedTool(gp + '/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')),
            ("utr3",
             BedTool(gp + '/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')),
        ]
        # Overlap beds - could be done incrementally
        print("Overlapping all the bed-files")
        # The BT() and .fn are there in order to leverage BedToolLinecache
        self.overlap_beds = [(b, BT(self.bt.intersect(v, wa=True, c=True).fn))
                             for b, v in self.gencode_beds]
        print("Assesing the file")
        assert len(self.overlap_beds[1][1]) == len(self.bt)
Пример #7
0
def singularity_pull(remote_path, local_path):
    """Run `singularity pull`

    Args:
      remote_path: singularity remote path. Example: shub://kipoi/models:latest
      local_path: local file path to the ".sif" file
    """
    makedir_exist_ok(os.path.dirname(local_path))
    if os.path.exists(local_path):
        logger.info(
            "Container file {} already exists. Skipping `singularity pull`".
            format(local_path))
    else:
        if os.environ.get('SINGULARITY_CACHEDIR'):
            downloaded_path = os.path.join(
                os.environ.get('SINGULARITY_CACHEDIR'),
                os.path.basename(local_path))
            pull_dir = os.path.dirname(downloaded_path)
            logger.info("SINGULARITY_CACHEDIR is set to {}".format(
                os.environ.get('SINGULARITY_CACHEDIR')))
            if os.path.exists(downloaded_path):
                logger.info(
                    "Container file {} already exists. Skipping `singularity pull` and softlinking it"
                    .format(downloaded_path))
                if os.path.islink(local_path):
                    logger.info(
                        "Softlink {} already exists. Removing it".format(
                            local_path))
                    os.remove(local_path)

                logger.info(
                    "Soflinking the downloaded file: ln -s {} {}".format(
                        downloaded_path, local_path))
                os.symlink(downloaded_path, local_path)
                return None
        else:
            pull_dir = os.path.dirname(local_path)

        logger.info(
            "Container file {} doesn't exist. Pulling the container from {}. Saving it to: {}"
            .format(local_path, remote_path, pull_dir))
        cmd = [
            'singularity', 'pull', '--name',
            os.path.basename(local_path), remote_path
        ]
        logger.info(" ".join(cmd))
        returncode = subprocess.call(cmd, cwd=pull_dir)
        if returncode != 0:
            raise ValueError("Command: {} failed".format(" ".join(cmd)))

        # softlink it
        if os.environ.get('SINGULARITY_CACHEDIR'):
            if os.path.islink(local_path):
                logger.info("Softlink {} already exists. Removing it".format(
                    local_path))
                os.remove(local_path)
            logger.info("Soflinking the downloaded file: ln -s {} {}".format(
                downloaded_path, local_path))
            os.symlink(downloaded_path, local_path)

        if not os.path.exists(local_path):
            raise ValueError(
                "Container doesn't exist at the download path: {}".format(
                    local_path))