def download_gencode_dir(output_dir):
    """Fetch the gencode feature bed files into `output_dir`.

    The directory is created with `makedir_exist_ok`. A file is (re-)downloaded
    only when it is missing locally or when `RemoteFile.validate` returns a
    falsy value for the local copy.

    Args:
      output_dir: directory receiving the `.bed.gz` files
    """
    makedir_exist_ok(output_dir)

    # The URL points at a specific commit of the kipoi/models repository.
    # Alternative template that is currently not used:
    # "https://github.com/uci-cbcl/FactorNet/blob/master/resources/{}?raw=true"
    url_template = ("https://github.com/kipoi/models/blob/"
                    "7648d3fd57def50934835b52acadd26bcaaa275c/FactorNet/"
                    "template/dataloader_files/gencode_features/{}?raw=true")

    # (file name, md5 handed to RemoteFile)
    remote_files = (
        ('cpgisland.bed.gz',
         'ac7dc007d7019c05adb7a331d1d6721d'),
        ('wgEncodeGencodeBasicV19.cds.merged.bed.gz',
         '4ec9883932932efe87e4adc6c84ced1c'),
        ('wgEncodeGencodeBasicV19.intron.merged.bed.gz',
         'd2db7e3255323d2b5b04e1c0c59ecd2d'),
        ('wgEncodeGencodeBasicV19.promoter.merged.bed.gz',
         '48fe1ab3aa0e9f5d11f3e5dfedbd47b6'),
        ('wgEncodeGencodeBasicV19.utr5.merged.bed.gz',
         'de87c14d4ff055226afeb01446aba6e6'),
        ('wgEncodeGencodeBasicV19.utr3.merged.bed.gz',
         '8bbe08f5fba86306dfbef56d756856f1'),
    )

    for name, checksum in remote_files:
        target = os.path.join(output_dir, name)
        remote = RemoteFile(url=url_template.format(name), md5=checksum)
        # A present and valid local copy needs no download
        if os.path.exists(target) and remote.validate(target):
            continue
        remote.get_file(target)
def create_conda_run():
    """Create conda_run bash script to ~/.kipoi/bin/conda_run

    The script is written to `<_kipoi_dir>/bin/conda_run`. It expects a conda
    environment name followed by a command; it activates the environment,
    runs the command and deactivates the environment again.

    NOTE: this should be changed to `conda run` once conda=4.6.0 is released
    https://github.com/conda/conda/issues/2379

    Returns:
      path to the written `conda_run` script

    Raises:
      OSError: if the script can't be written or its mode can't be changed
    """
    import stat
    from kipoi.config import _kipoi_dir

    # One statement per line: the `#` comments in the script would otherwise
    # swallow everything that follows them.
    crun = """#!/bin/bash
# Run a bash command in a new conda environment
set -e # stop on error
if [[ $# -lt 2 ]] ; then
    echo "Usage: "
    echo " conda_run <conda envrionment> <command> "
    exit 0
fi
env=$1
cmd=${@:2}
echo "Running command in env: $env"
echo "Command: $cmd"
source activate $env
$cmd
source deactivate $env
"""
    bin_dir = os.path.join(_kipoi_dir, 'bin')
    makedir_exist_ok(bin_dir)
    crun_path = os.path.join(bin_dir, 'conda_run')
    with open(crun_path, 'w') as f:
        f.write(crun)

    # make it executable for the owner (same effect as `chmod u+x`) without
    # spawning an external process whose exit status would go unchecked
    current_mode = os.stat(crun_path).st_mode
    os.chmod(crun_path, current_mode | stat.S_IXUSR)
    return crun_path
def __init__(self, intervals_file, fasta_file, dnase_file,
             cell_line=None, RNAseq_PC_file=None,
             mappability_file=None,
             use_linecache=True):
    """Store the input paths and fetch the auxiliary files when needed.

    Args:
      intervals_file: intervals file opened with `BedToolLinecache`
        (`use_linecache=True`) or `BedTool`
      fasta_file: fasta file path; the extractor is initialized later
      dnase_file: DNase file path; the extractor is initialized later
      cell_line: when given, `<cell_line>/meta.txt` is downloaded (if not
        present locally) and used as the metadata feature file
      RNAseq_PC_file: metadata feature file. Required when `cell_line=None`
      mappability_file: mappability bigWig file. When None, the file is
        downloaded into the shared download directory
      use_linecache: if True, clear the linecache and use `BedToolLinecache`

    Raises:
      ValueError: if both `cell_line` and `RNAseq_PC_file` are None
    """
    # intervals
    if use_linecache:
        linecache.clearcache()
    interval_cls = BedToolLinecache if use_linecache else BedTool
    self.bt = interval_cls(intervals_file)

    # Fasta
    self.fasta_file = fasta_file
    self.fasta_extractor = None  # initialize later

    # DNase
    self.dnase_file = dnase_file
    self.dnase_extractor = None

    # mappability
    if mappability_file is None:
        # download the mappability file if not existing
        download_root = os.path.join(
            this_dir, "../../template/downloaded/dataloader_files")
        makedir_exist_ok(download_root)
        remote = RemoteFile(
            url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
            md5="1d15ddafe2c8df51cf08495db96679e7")
        mappability_file = os.path.join(
            download_root, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
        local_copy_ok = (os.path.exists(mappability_file)
                         and remote.validate(mappability_file))
        if not local_copy_ok:
            remote.get_file(mappability_file)
    self.mappability_file = mappability_file
    self.mappability_extractor = None

    # Get the metadata features
    if cell_line is not None:
        # Using the pre-defined cell-line
        features_dir = os.path.join(
            this_dir,
            "../../template/downloaded/dataloader_files/RNAseq_features/")
        makedir_exist_ok(features_dir)
        RNAseq_PC_file = os.path.join(features_dir, cell_line, "meta.txt")
        url_template = ('https://s3.eu-central-1.amazonaws.com/kipoi-models/dataloader_files/'
                        'FactorNet/dataloader_files/RNAseq_features/{}/meta.txt')
        # Only the existence of the file is checked here (no checksum)
        if not os.path.exists(RNAseq_PC_file):
            download_url(url_template.format(cell_line),
                         os.path.join(features_dir, cell_line),
                         "meta.txt")
    else:
        if RNAseq_PC_file is None:
            raise ValueError("RNAseq_PC_file has to be specified when cell_line=None")
        assert os.path.exists(RNAseq_PC_file)

    # First column of the header-less, tab-separated file
    meta_table = pd.read_csv(RNAseq_PC_file, sep="\t", header=None)
    self.meta_feat = meta_table[0].values
def __init__(self, intervals_file, fasta_file, dnase_file,
             mappability_file=None,
             use_linecache=True):
    """Store the input paths and fetch the mappability file when needed.

    Args:
      intervals_file: intervals file opened with `BedToolLinecache`
        (`use_linecache=True`) or `BedTool`
      fasta_file: fasta file path; the extractor is initialized later
      dnase_file: DNase file path; the extractor is initialized later
      mappability_file: mappability bigWig file. When None, the file is
        downloaded into the shared download directory
      use_linecache: if True, clear the linecache and use `BedToolLinecache`
    """
    # intervals
    if use_linecache:
        linecache.clearcache()
    interval_cls = BedToolLinecache if use_linecache else BedTool
    self.bt = interval_cls(intervals_file)

    # Fasta
    self.fasta_file = fasta_file
    self.fasta_extractor = None  # initialize later

    # DNase
    self.dnase_file = dnase_file
    self.dnase_extractor = None

    # mappability
    if mappability_file is None:
        # download the mappability file if not existing
        download_root = os.path.join(
            this_dir, "../../template/downloaded/dataloader_files")
        makedir_exist_ok(download_root)
        remote = RemoteFile(
            url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
            md5="1d15ddafe2c8df51cf08495db96679e7")
        mappability_file = os.path.join(
            download_root, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
        local_copy_ok = (os.path.exists(mappability_file)
                         and remote.validate(mappability_file))
        if not local_copy_ok:
            remote.get_file(mappability_file)
    self.mappability_file = mappability_file
    self.mappability_extractor = None
def __init__(self, intervals_file, fasta_file, dnase_file,
             cell_line=None, RNAseq_PC_file=None,
             mappability_file=None,
             GENCODE_dir=None,
             use_linecache=True):
    """Store the input paths, fetch auxiliary files and overlap the intervals.

    Args:
      intervals_file: intervals file opened with `BedToolLinecache`
        (`use_linecache=True`) or `BedTool`
      fasta_file: fasta file path; the extractor is initialized later
      dnase_file: DNase file path; the extractor is initialized later
      cell_line: when given, `<cell_line>/meta.txt` is downloaded (if not
        present locally) and used as the metadata feature file
      RNAseq_PC_file: metadata feature file. Required when `cell_line=None`
      mappability_file: mappability bigWig file. When None, the file is
        downloaded into the shared download directory
      GENCODE_dir: directory with the gencode bed files. When None, the
        shared download directory is used. Files are fetched with
        `download_gencode_dir` in both cases
      use_linecache: if True, clear the linecache and use `BedToolLinecache`

    Raises:
      ValueError: if both `cell_line` and `RNAseq_PC_file` are None
    """
    # intervals
    if use_linecache:
        linecache.clearcache()
    BT = BedToolLinecache if use_linecache else BedTool
    self.bt = BT(intervals_file)

    # Fasta
    self.fasta_file = fasta_file
    self.fasta_extractor = None  # initialize later

    # DNase
    self.dnase_file = dnase_file
    self.dnase_extractor = None

    # mappability
    if mappability_file is None:
        # download the mappability file if not existing
        download_root = os.path.join(
            this_dir, "../../template/downloaded/dataloader_files")
        makedir_exist_ok(download_root)
        remote = RemoteFile(
            url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
            md5="1d15ddafe2c8df51cf08495db96679e7")
        mappability_file = os.path.join(
            download_root, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
        local_copy_ok = (os.path.exists(mappability_file)
                         and remote.validate(mappability_file))
        if not local_copy_ok:
            remote.get_file(mappability_file)
    self.mappability_file = mappability_file
    self.mappability_extractor = None

    # Gencode features
    gp = GENCODE_dir
    if gp is None:
        gp = os.path.join(
            this_dir,
            "../../template/downloaded/dataloader_files/gencode_features/")
    download_gencode_dir(gp)  # download files

    # (feature name, file suffix appended to the gencode directory)
    gencode_files = (
        ("cpg", '/cpgisland.bed.gz'),
        ("cds", '/wgEncodeGencodeBasicV19.cds.merged.bed.gz'),
        ("intron", '/wgEncodeGencodeBasicV19.intron.merged.bed.gz'),
        ("promoter", '/wgEncodeGencodeBasicV19.promoter.merged.bed.gz'),
        ("utr5", '/wgEncodeGencodeBasicV19.utr5.merged.bed.gz'),
        ("utr3", '/wgEncodeGencodeBasicV19.utr3.merged.bed.gz'),
    )
    self.gencode_beds = [(label, BedTool(gp + suffix))
                         for label, suffix in gencode_files]

    # Overlap beds - could be done incrementally
    print("Overlapping all the bed-files")
    overlaps = []
    for label, feature_bed in self.gencode_beds:
        counted = self.bt.intersect(feature_bed, wa=True, c=True)
        # The BT() and .fn are there in order to leverage BedToolLinecache
        overlaps.append((label, BT(counted.fn)))
    self.overlap_beds = overlaps

    print("Assesing the file")
    # The "cds" overlap has to contain as many entries as the intervals
    cds_overlap = self.overlap_beds[1][1]
    assert len(cds_overlap) == len(self.bt)

    # Get the metadata features
    if cell_line is not None:
        # Using the pre-defined cell-line
        features_dir = os.path.join(
            this_dir,
            "../../template/downloaded/dataloader_files/RNAseq_features/")
        makedir_exist_ok(features_dir)
        RNAseq_PC_file = os.path.join(features_dir, cell_line, "meta.txt")
        url_template = (
            'https://s3.eu-central-1.amazonaws.com/kipoi-models/dataloader_files/'
            'FactorNet/dataloader_files/RNAseq_features/{}/meta.txt')
        # Only the existence of the file is checked here (no checksum)
        if not os.path.exists(RNAseq_PC_file):
            download_url(url_template.format(cell_line),
                         os.path.join(features_dir, cell_line),
                         "meta.txt")
    else:
        if RNAseq_PC_file is None:
            raise ValueError(
                "RNAseq_PC_file has to be specified when cell_line=None")
        assert os.path.exists(RNAseq_PC_file)

    # First column of the header-less, tab-separated file
    meta_table = pd.read_csv(RNAseq_PC_file, sep="\t", header=None)
    self.meta_feat = meta_table[0].values
def __init__(self, intervals_file, fasta_file, dnase_file,
             cell_line=None,
             mappability_file=None,
             GENCODE_dir=None,
             use_linecache=True):
    """Store the input paths, fetch auxiliary files and overlap the intervals.

    Args:
      intervals_file: intervals file opened with `BedToolLinecache`
        (`use_linecache=True`) or `BedTool`
      fasta_file: fasta file path; the extractor is initialized later
      dnase_file: DNase file path; the extractor is initialized later
      cell_line: accepted but not used in this constructor
      mappability_file: mappability bigWig file. When None, the file is
        downloaded into the shared download directory
      GENCODE_dir: directory with the gencode bed files. When None, the
        shared download directory is used. Files are fetched with
        `download_gencode_dir` in both cases
      use_linecache: if True, clear the linecache and use `BedToolLinecache`
    """
    # intervals
    if use_linecache:
        linecache.clearcache()
    BT = BedToolLinecache if use_linecache else BedTool
    self.bt = BT(intervals_file)

    # Fasta
    self.fasta_file = fasta_file
    self.fasta_extractor = None  # initialize later

    # DNase
    self.dnase_file = dnase_file
    self.dnase_extractor = None

    # mappability
    if mappability_file is None:
        # download the mappability file if not existing
        download_root = os.path.join(
            this_dir, "../../template/downloaded/dataloader_files")
        makedir_exist_ok(download_root)
        remote = RemoteFile(
            url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
            md5="1d15ddafe2c8df51cf08495db96679e7")
        mappability_file = os.path.join(
            download_root, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
        local_copy_ok = (os.path.exists(mappability_file)
                         and remote.validate(mappability_file))
        if not local_copy_ok:
            remote.get_file(mappability_file)
    self.mappability_file = mappability_file
    self.mappability_extractor = None

    # Gencode features
    gp = GENCODE_dir
    if gp is None:
        gp = os.path.join(
            this_dir,
            "../../template/downloaded/dataloader_files/gencode_features/")
    download_gencode_dir(gp)  # download files

    # (feature name, file suffix appended to the gencode directory)
    gencode_files = (
        ("cpg", '/cpgisland.bed.gz'),
        ("cds", '/wgEncodeGencodeBasicV19.cds.merged.bed.gz'),
        ("intron", '/wgEncodeGencodeBasicV19.intron.merged.bed.gz'),
        ("promoter", '/wgEncodeGencodeBasicV19.promoter.merged.bed.gz'),
        ("utr5", '/wgEncodeGencodeBasicV19.utr5.merged.bed.gz'),
        ("utr3", '/wgEncodeGencodeBasicV19.utr3.merged.bed.gz'),
    )
    self.gencode_beds = [(label, BedTool(gp + suffix))
                         for label, suffix in gencode_files]

    # Overlap beds - could be done incrementally
    print("Overlapping all the bed-files")
    overlaps = []
    for label, feature_bed in self.gencode_beds:
        counted = self.bt.intersect(feature_bed, wa=True, c=True)
        # The BT() and .fn are there in order to leverage BedToolLinecache
        overlaps.append((label, BT(counted.fn)))
    self.overlap_beds = overlaps

    print("Assesing the file")
    # The "cds" overlap has to contain as many entries as the intervals
    cds_overlap = self.overlap_beds[1][1]
    assert len(cds_overlap) == len(self.bt)
def singularity_pull(remote_path, local_path):
    """Run `singularity pull` unless the container file is already available.

    Nothing is pulled when `local_path` exists. If the environment variable
    SINGULARITY_CACHEDIR is set, the container is pulled into that directory
    (or re-used from there when already present) and `local_path` becomes
    a softlink to it. Otherwise the container is pulled into the directory
    of `local_path`.

    Args:
      remote_path: singularity remote path. Example: shub://kipoi/models:latest
      local_path: local file path to the ".sif" file

    Raises:
      ValueError: if `singularity pull` returns a non-zero exit code or if
        `local_path` doesn't exist after pulling
    """
    makedir_exist_ok(os.path.dirname(local_path))

    if os.path.exists(local_path):
        logger.info(
            "Container file {} already exists. Skipping `singularity pull`".format(local_path))
        return None

    def _softlink(source):
        # Replace a possibly stale link at `local_path` with a link to `source`
        if os.path.islink(local_path):
            logger.info("Softlink {} already exists. Removing it".format(local_path))
            os.remove(local_path)
        logger.info("Soflinking the downloaded file: ln -s {} {}".format(source, local_path))
        os.symlink(source, local_path)

    container_name = os.path.basename(local_path)
    cache_dir = os.environ.get('SINGULARITY_CACHEDIR')

    if cache_dir:
        downloaded_path = os.path.join(cache_dir, container_name)
        pull_dir = os.path.dirname(downloaded_path)
        logger.info("SINGULARITY_CACHEDIR is set to {}".format(cache_dir))
        if os.path.exists(downloaded_path):
            # Already in the cache directory - only the link is needed
            logger.info(
                "Container file {} already exists. Skipping `singularity pull` and softlinking it".format(downloaded_path))
            _softlink(downloaded_path)
            return None
    else:
        pull_dir = os.path.dirname(local_path)

    logger.info(
        "Container file {} doesn't exist. Pulling the container from {}. Saving it to: {}".format(
            local_path, remote_path, pull_dir))
    cmd = ['singularity', 'pull', '--name', container_name, remote_path]
    cmd_str = " ".join(cmd)
    logger.info(cmd_str)
    if subprocess.call(cmd, cwd=pull_dir) != 0:
        raise ValueError("Command: {} failed".format(cmd_str))

    # softlink it
    if cache_dir:
        _softlink(downloaded_path)

    if not os.path.exists(local_path):
        raise ValueError(
            "Container doesn't exist at the download path: {}".format(local_path))