def _download_file(self, url, filename, compress=False, timeout=30):
    """Download a file to the resources folder.

    Download data from `url`, save as `filename`, and optionally compress
    with gzip. If the URL uses FTP and the download fails, retry once over
    HTTP.

    Parameters
    ----------
    url : str
        URL to download data from
    filename : str
        name of file to save; if compress, ensure '.gz' is appended
    compress : bool
        compress with gzip
    timeout : int
        seconds for timeout of download request

    Returns
    -------
    str
        path to downloaded file, empty str if error
    """
    if compress and not filename.endswith(".gz"):
        filename += ".gz"

    destination = os.path.join(self._resources_dir, filename)

    if not create_dir(os.path.relpath(os.path.dirname(destination))):
        return ""

    if not os.path.exists(destination):
        try:
            # get file if it hasn't already been downloaded
            # http://stackoverflow.com/a/7244263
            with urllib.request.urlopen(
                url, timeout=timeout
            ) as response, atomic_write(destination, mode="wb") as f:
                self._print_download_msg(destination)
                data = response.read()  # a `bytes` object

                if compress:
                    self._write_data_to_gzip(f, data)
                else:
                    f.write(data)
        except urllib.error.URLError as err:
            logger.warning(err)
            destination = ""
            # try HTTP if an FTP error occurred; replace only the scheme
            # prefix, not any other occurrence of "ftp://" in the URL
            if url.startswith("ftp://"):
                destination = self._download_file(
                    url.replace("ftp://", "http://", 1),
                    filename,
                    compress=compress,
                    timeout=timeout,
                )
        except socket.timeout:
            logger.warning(f"Timeout downloading {url}")
            destination = ""

    return destination
def _setup_gsa_test():
    """Prepare gzipped GSA rsid/chrpos mapping files in `resources` for tests."""
    # reset resource if already loaded
    temp = SNPs()
    temp._resources._gsa_resources = {}

    create_dir("resources")

    # both mapping files get the identical copy-and-gzip treatment, so
    # handle them in one loop instead of two duplicated stanzas
    for name in ("gsa_rsid_map.txt", "gsa_chrpos_map.txt"):
        with open(f"tests/resources/{name}", "rb") as f_in:
            with atomic_write(f"resources/{name}.gz", mode="wb", overwrite=True) as f_out:
                with gzip.open(f_out, "wb") as f_gzip:
                    shutil.copyfileobj(f_in, f_gzip)
def _get_path_assembly_mapping_data(self, source_assembly, target_assembly, retries=10):
    """
    Get local path to assembly mapping data, downloading if necessary.

    Parameters
    ----------
    source_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
        assembly to remap from
    target_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
        assembly to remap to
    retries : int
        number of retries per chromosome to download assembly mapping data

    Returns
    -------
    str
        path to <source_assembly>_<target_assembly>.tar.gz

    References
    ----------
    1. Ensembl, Assembly Information Endpoint,
       https://rest.ensembl.org/documentation/info/assembly_info
    2. Ensembl, Assembly Map Endpoint,
       http://rest.ensembl.org/documentation/info/assembly_map
    """
    if not create_dir(self._resources_dir):
        return ""

    # autosomes 1-22, sex chromosomes, and mitochondrial DNA
    chroms = [str(c) for c in range(1, 23)] + ["X", "Y", "MT"]

    assembly_mapping_data = f"{source_assembly}_{target_assembly}"
    destination = os.path.join(
        self._resources_dir, assembly_mapping_data + ".tar.gz"
    )

    if not os.path.exists(destination):
        logger.info(f"Downloading {os.path.relpath(destination)}")
        self._download_assembly_mapping_data(
            destination, chroms, source_assembly, target_assembly, retries
        )

    return destination
"""Get a file from the openSNP datadump for debugging."""

import os

from atomicwrites import atomic_write

from snps.resources import Resources
from snps.utils import create_dir

OUTPUT_DIR = "output"
FILE = "user662_file340_yearofbirth_unknown_sex_unknown.23andme.txt"

if __name__ == "__main__":
    # ensure this example's output directory exists
    create_dir(OUTPUT_DIR)

    # resources dir is relative to the examples directory the script runs from
    resources = Resources(resources_dir="../../resources")

    # write the requested datadump file atomically to the output directory
    destination = os.path.join(OUTPUT_DIR, FILE)
    with atomic_write(destination, mode="wb") as f:
        f.write(resources.load_opensnp_datadump_file(FILE))
def _find_shared_dna_output_helper(
    self,
    individuals,
    one_chrom_shared_dna,
    two_chrom_shared_dna,
    one_chrom_shared_genes,
    two_chrom_shared_genes,
):
    """Plot and save shared DNA / shared gene results for `individuals`.

    Produces a chromosomes plot (PNG) and up to four CSV files in the
    output directory, one per non-empty result DataFrame.

    Parameters
    ----------
    individuals : list
        individuals being compared; used to build file names and plot title
    one_chrom_shared_dna : pandas.DataFrame
        segments shared on one chromosome
    two_chrom_shared_dna : pandas.DataFrame
        segments shared on both chromosomes
    one_chrom_shared_genes : pandas.DataFrame
        genes shared on one chromosome
    two_chrom_shared_genes : pandas.DataFrame
        genes shared on both chromosomes
    """
    cytobands = self._resources.get_cytoBand_hg19()

    # "<var1>_<var2>" filename fragment and "<name1> / <name2>" plot title
    individuals_filename = "_".join(
        individual.get_var_name() for individual in individuals
    )
    individuals_plot_title = " / ".join(
        individual.name for individual in individuals
    )

    if create_dir(self._output_dir):
        plot_chromosomes(
            one_chrom_shared_dna,
            two_chrom_shared_dna,
            cytobands,
            os.path.join(
                self._output_dir, "shared_dna_{}.png".format(individuals_filename)
            ),
            "{} shared DNA".format(individuals_plot_title),
            37,
        )

    # each CSV output is (DataFrame, filename template, float_format or None);
    # only the shared-DNA outputs format floats with 2 decimal places
    outputs = [
        (one_chrom_shared_dna, "shared_dna_one_chrom_{}_GRCh37.csv", "%.2f"),
        (two_chrom_shared_dna, "shared_dna_two_chroms_{}_GRCh37.csv", "%.2f"),
        (one_chrom_shared_genes, "shared_genes_one_chrom_{}_GRCh37.csv", None),
        (two_chrom_shared_genes, "shared_genes_two_chroms_{}_GRCh37.csv", None),
    ]
    for df, template, float_format in outputs:
        if len(df) > 0:
            # only pass float_format when specified, preserving the
            # default behavior of save_df_as_csv for the gene outputs
            extra = {"float_format": float_format} if float_format else {}
            save_df_as_csv(
                df,
                self._output_dir,
                template.format(individuals_filename),
                comment=self._get_csv_header(),
                prepend_info=False,
                **extra,
            )
def main():
    """Parse a sample of openSNP datadump files, log statistics, and
    optionally extract files that failed to parse for debugging."""
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    filenames = [
        filename
        for filename in filenames
        if "readme" not in filename and "phenotype" not in filename
    ]

    # draw a sample from the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # get results from `load_file` where `count` was non-zero
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=["file", "source", "build", "build_detected", "chromosomes", "count"],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # log parsing statistics
    # NOTE(review): these ratios raise ZeroDivisionError if no files are
    # found / parsed; acceptable for a debugging script run on a real dump
    file_count = len(filenames)
    logger.info(f"{file_count} files in the openSNP datadump")
    logger.info(f"{(len(df) / file_count):.2%} of openSNP datadump files parsed")
    logger.info(
        f"build detected in {len(df.loc[df.build_detected]) / len(df):.2%} of files parsed"
    )

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        d = {}
        for result in results:
            if "msg" in result:
                d.setdefault(result["msg"], []).append(result["file"])

        # add messages / file filters as necessary...
        d["build not detected"] = list(df.loc[~df.build_detected].file.values)

        # extract files that have messages for debugging
        for msg, files in d.items():
            if not files:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(OUTPUT_DIR, f"{len(files):04}_{clean_str(msg)}")
            create_dir(path)

            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename), mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")
def _get_path_assembly_mapping_data(self, source_assembly, target_assembly, retries=10):
    """
    Get local path to assembly mapping data, downloading if necessary.

    Parameters
    ----------
    source_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
        assembly to remap from
    target_assembly : {'NCBI36', 'GRCh37', 'GRCh38'}
        assembly to remap to
    retries : int
        number of retries per chromosome to download assembly mapping data

    Returns
    -------
    str
        path to <source_assembly>_<target_assembly>.tar.gz

    References
    ----------
    1. Ensembl, Assembly Information Endpoint,
       https://rest.ensembl.org/documentation/info/assembly_info
    2. Ensembl, Assembly Map Endpoint,
       http://rest.ensembl.org/documentation/info/assembly_map
    """
    if not create_dir(self._resources_dir):
        return ""

    # autosomes 1-22 plus sex chromosomes and mitochondrial DNA; build
    # programmatically instead of the hard-coded 25-element literal
    chroms = [str(i) for i in range(1, 23)] + ["X", "Y", "MT"]

    assembly_mapping_data = source_assembly + "_" + target_assembly

    destination = os.path.join(
        self._resources_dir, assembly_mapping_data + ".tar.gz"
    )

    if not os.path.exists(destination):
        # f-string logging for consistency with the rest of the module
        logger.info(f"Downloading {os.path.relpath(destination)}")
        self._download_assembly_mapping_data(
            destination, chroms, source_assembly, target_assembly, retries
        )

    return destination