def downloadFiles(url):
    if not os.path.exists(FILE_TMP_FOLDER):
        os.makedirs(FILE_TMP_FOLDER)
    else:
        print("[INFO] Clearing temporary download directory")
        for the_file in os.listdir(FILE_TMP_FOLDER):
            file_path = os.path.join(FILE_TMP_FOLDER, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
    fileList = []
    filename = FILE_TMP_FOLDER + "/downloaded.fa.gz"
    print("[INFO] Downloading file {} from remote source".format(url))
    urllib.urlretrieve(FILE_TO_DOWNLOAD, filename)
    print("[INFO] Unpacking gz archive")
    gunzip(filename)
    for file in os.listdir(FILE_TMP_FOLDER):
        if file.endswith(".fa"):
            fileList.append(os.path.join(FILE_TMP_FOLDER, file))
    print("[INFO] Detected {} fasta files".format(len(fileList)))
    return fileList

def genome_download(name, output_path):
    path = ''.join([output_path + name.replace(" ", "_")])
    os.makedirs(path)
    ngd.download(group="bacteria", genus=name, file_format="fasta", parallel=10, dry_run=True)
    ngd.download(group="bacteria", genus=name, file_format="fasta", parallel=10, dry_run=False, output=path)
    files = []
    for r, d, f in os.walk(path):
        for file in f:
            if '.gz' in file:
                files.append(os.path.join(r, file))
    for f in files:
        sh.gunzip(f)
    files2 = []
    for r, d, f in os.walk(path):
        for file in f:
            if '.fna' in file:
                files2.append(os.path.join(r, file))
    out = ''.join([output_path + "/" + name.replace(" ", "_") + ".fasta"])
    sh.cat(files2, _out=out)
    return path

def download_epgs(result):
    logger.info('download_epgs()')
    index = 1
    downloaded_list = []
    for url in tv_epg_urls:
        file_result = []
        file_result.append("epg #" + str(index))
        try:
            file_name = 'epg-' + str(index) + '.xml.gz'
            file_name = download_file(url, file_name, file_result)
            if file_name.endswith('.gz'):
                xml_file_name = file_name.replace('.gz', '')
                if os.path.exists(xml_file_name):
                    os.remove(xml_file_name)
                gunzip(file_name)
                file_name = xml_file_name
            downloaded_list.append(file_name)
            result.append(file_result[0] + ", " + file_result[1] + ": " + sizeof_fmt(os.path.getsize(file_name)))
            logger.info('download_epg done, xml size: %s', sizeof_fmt(os.path.getsize(file_name)))
        except Exception as e:
            logger.error('ERROR in download_epg %s', e)
            print(e)
        index = index + 1
    return downloaded_list

def regrid_time_series_afni(input, n=2):
    """
    This function upsamples a time series using the afni function 3dUpsample. Before running the
    function, set the afni environment by calling AFNI in the terminal. Output is an upsampled
    nifti time series.
    Inputs:
        *input: time series filename.
        *n: upsampling factor.

    created by Daniel Haenelt
    Date created: 20-09-2019
    Last modified: 19-02-2020
    """
    import os
    from sh import gunzip

    clean_unzip = 0
    if os.path.splitext(input)[1] == ".gz":
        gunzip(input)
        clean_unzip = 1
        input = os.path.splitext(input)[0]

    # prepare path and filename
    path_file = os.path.dirname(input)
    name_file = os.path.splitext(os.path.basename(input))[0]

    # upsample vaso and bold time series
    os.system("3dUpsample -overwrite -datum short " + \
              "-prefix " + os.path.join(path_file, name_file + "_upsampled.nii") + \
              " -n " + str(n) + " -input " + input)

    if clean_unzip:
        os.remove(input)

def download_file(language, position):
    location = "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{}.zip".format(language)
    # Get the file from the server
    requested_file = urlopen(location)
    total_length = requested_file.headers.get('content-length')
    if total_length:
        total_length = int(total_length)
        blocksize = max(4096, total_length // 100)
    else:
        blocksize = 1000000
    with open("/tmp/{}.zip".format(language), "wb") as tempzip:
        with tqdm(total=total_length, position=position) as tq:
            tq.set_description("Downloading {}".format(language))
            while True:
                data = requested_file.read(blocksize)
                if not data:
                    break
                tempzip.write(data)
                tq.update(blocksize)
    with ZipFile("/tmp/{}.zip".format(language)) as zf:
        zf.extractall(path="./data/")
    # Finally delete the temp file
    os.remove("/tmp/{}.zip".format(language))
    # Get all of the zipped files and extract them
    files = []
    for file in glob.glob("data/{}/**/*.gz".format(language), recursive=True):
        files.append(file)
    with tqdm(total=len(files), position=position) as tq:
        tq.set_description("Unzipping {}".format(language))
        for i in files:
            gunzip(i)
            tq.update(1)
    tq.close()

def disGenData():
    disGenDataDic = {}
    disgenFileName = 'all_gene_disease_associations.tsv'
    disGenURL = "https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz"
    filepath = os.getcwd()
    print("Extracting DisGen data, job starts", str(datetime.datetime.now()))
    try:
        urllib.urlretrieve(disGenURL, filepath + '/all_gene_disease_associations.tsv.gz')
        urllib.urlcleanup()
        print("Extracting DisGen data, job done", str(datetime.datetime.now()))
    except:
        print("Unable to download all_gene_disease_associations.tsv.gz file!!")
    if os.path.exists(filepath + '/all_gene_disease_associations.tsv'):
        os.remove(filepath + '/all_gene_disease_associations.tsv')
    print("Extracting .gz data, job starts", str(datetime.datetime.now()))
    gunzip(filepath + '/all_gene_disease_associations.tsv.gz')
    print("Extracting .gz data, job done", str(datetime.datetime.now()))
    disgendf = pd.read_csv(disgenFileName, delimiter='\t')
    disGenList = list(disgendf['geneSymbol'].unique())
    for gene in disGenList:
        #tempDisGenID = list(disgendf['geneId'][disgendf['geneSymbol'] == gene].unique())[0]
        tempDF = disgendf[['diseaseName', 'diseaseId']][(disgendf['geneSymbol'] == gene) & (disgendf['diseaseType'] == 'disease')]
        tempDisNamesInfo = list(zip(tempDF['diseaseName'], tempDF['diseaseId']))
        tempDisNames = [i[0] for i in tempDisNamesInfo]
        tempDisNamesURL = ['<a target="_blank" href="https://www.disgenet.org/search/0/' + i[1] + '">' + i[0] + '</a>' for i in tempDisNamesInfo]
        tempDisNames = map(str, tempDisNames)
        if len(tempDisNamesInfo) > 0:
            disGenDataDic[gene] = [tempDisNames, tempDisNamesURL]
    dicfile = 'disGen.obj'
    dicf = open(dicfile, 'wb')
    pickle.dump(disGenDataDic, dicf, pickle.HIGHEST_PROTOCOL)
    dicf.close()
    return dicfile

def download_trace(file_count):
    download_file(TRACE_DIR, SHA_FILE_NAME)
    with open(path.join(TRACE_DIR, SHA_FILE_NAME)) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=' ')
        line_count = 0
        for row in csv_reader:
            trace_file_name = row[1]
            trace_file_name = trace_file_name.replace("*", "")
            splits = trace_file_name.split("/")
            if len(splits) > 0:
                _type = splits[0]
                if _type not in file_type:
                    file_type[_type] = 0
                    try:
                        os.mkdir(path.join(TRACE_DIR, _type))
                    except OSError as error:
                        print("path %s already exists" % (path.join(TRACE_DIR, _type)))
                file_type[_type] = file_type[_type] + 1
                download_file(path.join(TRACE_DIR, _type), trace_file_name)
                if trace_file_name.find(".gz") != -1:
                    gunzip(path.join(TRACE_DIR, trace_file_name))
            line_count = line_count + 1
            if file_count != 0 and line_count >= file_count:
                break
    print_trace_info()
    return

def get_log_file():
    """Retrieve and extract log file data"""
    input_log_file = os.path.basename(FLAGS.data_url)
    try:
        urllib.request.urlretrieve(FLAGS.data_url, input_log_file)
        gunzip(input_log_file)
        return os.path.splitext(input_log_file)[0]
    except Exception as e:
        print('Exception {} when retrieving/extracting log file'.format(e))

def get_data(self, url):
    gz_file_name = url.split("/")[-1]
    gz_file_path = os.path.join(self.dir_name, gz_file_name)
    file_name = gz_file_name.split(".")[0]
    file_path = os.path.join(self.dir_name, file_name)
    os.makedirs(self.dir_name, exist_ok=True)
    if not os.path.exists(file_path):
        urllib.request.urlretrieve(url, gz_file_path)
        gunzip(gz_file_path)
    return file_path

def __getitem__(self, idx):
    folder = self.brats[idx]
    flair = self.brats[idx] + '/' + self.brats[idx][54:] + '_flair.nii.gz'
    t1 = self.brats[idx] + '/' + self.brats[idx][54:] + '_t1.nii.gz'
    t1ce = self.brats[idx] + '/' + self.brats[idx][54:] + '_t1ce.nii.gz'
    t2 = self.brats[idx] + '/' + self.brats[idx][54:] + '_t2.nii.gz'
    seg = self.brats[idx] + '/' + self.brats[idx][54:] + '_seg.nii.gz'
    # unzip if not already unzipped
    try:
        gunzip(flair)
    except:
        pass
    try:
        gunzip(t1)
    except:
        pass
    try:
        gunzip(t1ce)
    except:
        pass
    try:
        gunzip(t2)
    except:
        pass
    try:
        gunzip(seg)
    except:
        pass
    flair = self.brats[idx] + '/' + self.brats[idx][54:] + '_flair.nii'  # image files location
    t1 = self.brats[idx] + '/' + self.brats[idx][54:] + '_t1.nii'
    t1ce = self.brats[idx] + '/' + self.brats[idx][54:] + '_t1ce.nii'
    t2 = self.brats[idx] + '/' + self.brats[idx][54:] + '_t2.nii'
    seg = self.brats[idx] + '/' + self.brats[idx][54:] + '_seg.nii'
    img, seg, seg_orig = read_img(flair=flair, t1=t1, t1ce=t1ce, t2=t2, seg=seg)
    sample = {
        'img': img,
        'mask': seg.type(torch.ByteTensor),
        'seg_orig': seg_orig.type(torch.ByteTensor)
    }
    if self.transform:
        sample = self.transform(sample)
    return sample

def postClone(self, cloned_files, target_dir, version):
    """
    Extracts the downloaded assembly.

    .. versionadded:: 0.3.0
    """
    # GunZIP each of the cloned files (removing the archives)
    for f in cloned_files:
        sh.gunzip(f)

    return 0

def postClone(self, cloned_files, target_dir, version):
    """
    Extracts the compressed archives.

    .. versionadded:: 0.3.0
    """
    # GunZIP the files (and remove the archives)
    for f in cloned_files:
        # Only some of the files need to be gunzipped
        if f.endswith(".gz"):
            sh.gunzip(f)

def postClone(self, cloned_files, target_dir, version):
    """
    .. versionadded:: 0.3.0
    """
    # Start by extracting all the files
    for f in cloned_files:
        # GunZIP the file (and remove the archive)
        sh.gunzip(f)

    # Then let's concat them
    target_path = "{}/NCBI.Homo_sapiens.fa".format(target_dir)

    # Remove ".gz" ending to point to extracted files
    cat_args = [f[:-3] for f in cloned_files]

    # Execute the concatenation in the background and write to the target path
    sh.cat(*cat_args, _out=target_path, _bg=True)

def make_fastq_list(directory):
    fastqs = []

    # make sure the dir exists
    if not os.path.isdir(directory):
        log.warn("%s is not a valid dir, exiting", directory)
        raise SystemExit

    directory = os.path.abspath(directory)
    log.info("Reading %s for fastqs", directory)

    # see if there are any compressed files
    gz_blob = os.path.join(directory, "*.fastq.gz")
    gzs = glob.glob(gz_blob)

    for gz in gzs:
        log.info("gunzipping %s", gz)
        # 0 == all good, and bool false
        if sh.gunzip(gz):
            log.warn("gunzipping %s failed, exiting", gz)
            raise SystemExit

    # now glob the fastqs
    blob = os.path.join(directory, "*.fastq")
    fastqs.extend(glob.glob(blob))

    # make sure we got stuff
    if len(fastqs) == 0:
        log.warn("Fastq list is empty, exiting")
        raise SystemExit

    return fastqs

def postClone(self, cloned_files, target_dir, version):
    """
    .. versionadded:: 0.3.0
    """
    # Start by extracting all the files
    for f in cloned_files:
        # GunZIP the file (and remove the archive)
        sh.gunzip(f)

    # Then let's concat them
    target_path = "{}/Genbank.Homo_sapiens.fa".format(target_dir)

    # Remove ".gz" ending to point to extracted files
    cat_args = [f[:-3] for f in cloned_files]

    # Execute the concatenation and write to the target path
    sh.cat(*cat_args, _out=target_path)

def unzip_all(dpath='/home/elmirakh/sat_images/'):
    """Bulk unpacking of .gz archives."""
    zip_dirs = [dpath + x + '/' for x in os.listdir(dpath)]
    cnt = 0
    for zdir in tqdm(zip_dirs):
        for path_to_zip_file in os.listdir(zdir):
            if not path_to_zip_file.endswith('.gz'):
                continue
            path_to_zip_file = os.path.join(zdir, path_to_zip_file)
            try:
                gunzip(path_to_zip_file)
                cnt += 1
            except ErrorReturnCode:
                tqdm.write(path_to_zip_file)
                os.remove(path_to_zip_file)
    tqdm.write('Unzipped {} files successfully'.format(cnt))
    return

def encsr_encff(args):
    encff_time = time.time()
    filePath = './bedAuto/jsonENCSR/'
    encsrDict = dict()
    encffNames = []
    encffLinks = []
    encsrNameList, encsrLinkList = parseSearch(args[0])
    # getting encffs from jsons of encsrs
    # only encffs matching below criteria will be extracted from jsons of encsrs
    # file type: bed narrowPeak, output type: replicated peaks, assembly: GRCh38
    for encsrName, encsrLink in zip(encsrNameList, encsrLinkList):
        r = requests.get(encsrLink, allow_redirects=True)
        filePathCurrent = filePath + encsrName + '.json'
        with open(filePathCurrent, 'wb') as f:
            f.write(r.content)
        with open(filePathCurrent, 'r') as f:
            encsrDict[encsrName] = json.load(f)
        infoFiltered = encsrDict[encsrName]['files']
        for element in range(len(infoFiltered)):
            if infoFiltered[element]['file_type'] == 'bed narrowPeak' and infoFiltered[element]['output_type'] == 'replicated peaks':
                if infoFiltered[element]['assembly'] == 'GRCh38':
                    encffNames.append(infoFiltered[element]['cloud_metadata']['url'][-18:])
                    encffLinks.append(infoFiltered[element]['cloud_metadata']['url'])
    print('parse for encff --- %.2f seconds ---' % (time.time() - encff_time))
    download_time = time.time()
    encffPath = './bedAuto/filesBed/'
    # downloading and unzipping bed files
    for name, link in zip(encffNames, encffLinks):
        r = requests.get(link, allow_redirects=True)
        encffPathCurrent = encffPath + name
        with open(encffPathCurrent, 'wb') as f:
            f.write(r.content)
        gunzip(encffPathCurrent)
    print('download and unzip bed files --- %.2f seconds ---' % (time.time() - download_time))

def downloadUniprotIDMapping():
    idmppaingfilepath = os.getcwd()
    print("Extracting mapping data, job starts", str(datetime.datetime.now()))
    try:
        urllib.urlretrieve(
            'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.2015_03.gz',
            idmppaingfilepath + '/idmapping.dat.2015_03.gz')
        urllib.urlcleanup()
        print("Extracting mapping data, job done", str(datetime.datetime.now()))
    except:
        print("Unable to download idmapping.dat.2015_03.gz file!!")
    if os.path.exists(idmppaingfilepath + '/idmapping.dat.2015_03'):
        os.remove(idmppaingfilepath + '/idmapping.dat.2015_03')
    print("Extracting .gz data, job starts", str(datetime.datetime.now()))
    gunzip(idmppaingfilepath + '/idmapping.dat.2015_03.gz')
    print("Extracting .gz data, job done", str(datetime.datetime.now()))

def postClone(self, cloned_files, target_dir, version):
    """
    Extracts the compressed archives.

    .. versionadded:: 0.3.0
    """
    f = cloned_files[0]

    if self.newer("hg18", version):
        # Un-TAR the archive into the target directory
        sh.tar("-xzf", f, "-C", target_dir)
    else:
        # Rename to ".zip"
        sh.mv(f, f.replace("tar.gz", "zip"))

        # GunZIP the file (and remove the archive)
        sh.gunzip(f)

def run(indir, rbs, id_lst):
    assert hasattr(indir, 'mkdir_p')
    assert hasattr(rbs, 'mkdir_p')
    assert hasattr(id_lst, 'mkdir_p')
    names_and_ids = list(csv.DictReader(open(id_lst), delimiter=','))
    snames, sids = zip(*[(v['SampleName'], v['IssueID']) for v in names_and_ids])
    for sid in sids:
        #(rbs / sid).mkdir_p()
        print 'mkdir', rbs / sid
        mkdir_p(rbs / sid)
    old_and_new = newnames_by_dict(indir, rbs, snames, sids)
    for old, new in old_and_new:
        print 'gunzip', old
        print 'symlink', old.stripext(), new
        if not os.path.exists(old.stripext()):
            assert os.path.exists(old)
            sh.gunzip(old)
        os.symlink(old.stripext(), new)

def upsample_volume(file_in, file_out, dxyz=[0.4, 0.4, 0.4], rmode="Cu"):
    """
    This function upsamples a nifti volume using the afni function 3dresample. Before running the
    function, set the afni environment by calling AFNI in the terminal. Output is an upsampled
    nifti volume.
    Inputs:
        *file_in: nifti input filename.
        *file_out: nifti output filename.
        *dxyz: array of target resolution in single dimensions.
        *rmode: interpolation methods (Linear, NN, Cu, Bk).

    created by Daniel Haenelt
    Date created: 16-12-2019
    Last modified: 29-05-2020
    """
    import os
    import numpy as np
    from sh import gunzip
    from shutil import copyfile
    from lib.io.get_filename import get_filename

    # get path and file extension of input file
    path_in, _, ext_in = get_filename(file_in)

    # make temporary copy of input file
    tmp = np.random.randint(0, 10, 5)
    tmp_string = ''.join(str(i) for i in tmp)
    file_tmp = os.path.join(path_in, "tmp_" + tmp_string + ext_in)
    copyfile(file_in, file_tmp)

    if os.path.splitext(file_tmp)[1] == ".gz":
        gunzip(file_tmp)
        file_tmp = os.path.splitext(file_tmp)[0]

    # upsample volume
    os.system("3dresample " + \
              "-dxyz " + str(dxyz[0]) + " " + str(dxyz[1]) + " " + str(dxyz[2]) + " " + \
              "-rmode " + str(rmode) + " " + \
              "-inset " + file_tmp + " " + \
              "-prefix " + file_out)

    # remove temporary copy
    os.remove(file_tmp)

def download_file(url, filename):
    csvfilename = os.path.splitext(os.path.basename(filename))[0]
    if not os.path.isfile(csvfilename):
        print('Downloading File')
        response = requests.get(url)
        if response.status_code == 200:
            with open(filename, 'wb') as file:
                for chunk in response:
                    file.write(chunk)
            gunzip(filename)
        return (csvfilename)
    else:
        print('File exists')
        return (csvfilename)

def download_epg(self, index, url, downloaded_list):
    self.logger.info("download_epg(%s)" % url)
    start_time = time.time()
    file_name = 'epg-' + str(index) + '.xml.gz'
    try:
        file_name = self.download_file(url, file_name)
        if file_name.endswith('.gz'):
            xml_file_name = file_name.replace('.gz', '')
            if os.path.exists(xml_file_name):
                os.remove(xml_file_name)
            gunzip(file_name)
            file_name = xml_file_name
        downloaded_list.append(file_name)
    except Exception as e:
        self.logger.error('ERROR in download_epg(%s) %s' % (url, e))
    self.logger.info("download_epg(%s), xml size: %s, time: %sms" %
                     (url, self.sizeof_fmt(os.path.getsize(file_name)), time.time() - start_time))

def _unzip_clean(self, hmlfile):
    """
    Unzips an HML file and strips NMDP processing instructions
    and ns2 namespace prefixes from the extracted XML.

    :param hmlfile: Path to the gzipped HML file.
    :return: Path of the cleaned, unzipped HML file.
    """
    gunzip(hmlfile)
    hml_unzipped = ".".join(hmlfile.split(".")[0:len(hmlfile.split(".")) - 1])
    cmd = "perl -p -i -e 's/<\?X-NMDP-CORRECTION TRUE\?><\?X-NMDP-NOREPORTS\?>//g' " + hml_unzipped
    os.system(cmd)
    cmd4 = "perl -p -i -e 's/<\?xml.+\?>//g' " + hml_unzipped
    os.system(cmd4)
    cmd1 = "perl -p -i -e 's/\?//g' " + hml_unzipped
    os.system(cmd1)
    cmd2 = "perl -p -i -e 's/ns2://g' " + hml_unzipped
    os.system(cmd2)
    cmd3 = "perl -p -i -e 's/:ns2//g' " + hml_unzipped
    os.system(cmd3)
    return hml_unzipped

def fetch_encsr_encff(args):
    fetch_encsr_time = time.time()
    file_path = './bedAuto/jsonENCSR/'
    file_path_all = []
    encff_names = []
    encff_links = []
    encsr_names, encsr_links = parseSearch(args[0])
    for encsr_name, encsr_link in zip(encsr_names, encsr_links):
        r = requests.get(encsr_link, allow_redirects=True)
        file_path_current = file_path + encsr_name + '.json'
        file_path_all.append(file_path_current)
        with open(file_path_current, 'wb') as f:
            f.write(r.content)
    num_processes = len(file_path_all)
    with Pool(num_processes) as p:
        encff_names_links = p.map(multi_encsr_encff, file_path_all)
    print('parse for encff --- %.2f seconds ---' % (time.time() - fetch_encsr_time))
    download_time = time.time()
    encff_path = './bedAuto/filesBed/'
    for encff_name, encff_link in encff_names_links:
        encff_names.append(str(encff_name)[2:-2])
        encff_links.append(str(encff_link)[2:-2])
    # downloading and unzipping bed files
    for name, link in zip(encff_names, encff_links):
        r = requests.get(link, allow_redirects=True)
        encff_path_current = encff_path + name
        with open(encff_path_current, 'wb') as f:
            f.write(r.content)
        gunzip(encff_path_current)
    print('download and unzip bed files --- %.2f seconds ---' % (time.time() - download_time))

def _setup_mgm(self):
    mgm_location = self._find_binary(
        name='gmhmmp',
        options_message='Please select appropriate MetaGeneMark location',
        raise_if_not_found=MetaGeneMarkNotFound)
    mod_file = self._find_mgm_mod_file(dirname(mgm_location))
    gm_key_home = join(self.HOME, '.gm_key')
    if not exists(gm_key_home) or not self._is_gm_key_valid(gm_key_home):
        valid_gm_key = self._find_gm_key()
        if valid_gm_key.endswith(".gz"):
            print('Extracting {} to {}'.format(valid_gm_key, gm_key_home))
            sh.gunzip(valid_gm_key, '-c', _out=gm_key_home)
        else:
            print('Copying {} to {}'.format(valid_gm_key, gm_key_home))
            copy(valid_gm_key, gm_key_home)
    return {
        'bin': mgm_location,
        'mod_path': mod_file,
        # 'valid_key': valid_gm_key,
    }

def decompr_files(dname, fnames):
    """Runs gunzip on a list of files.

    Args:
        dname (str): The dirname containing files to gunzip.
        fnames (list): List of filenames in dir.

    Examples:
        Pipe.decompr_files('user/inputs/', ['a.gz', 'b.gz'])
    """
    try:
        for fname in fnames:
            if os.path.exists(dname + fname):
                sh.gunzip(dname + fname)
            elif os.path.exists(dname + os.path.splitext(fname)[0]):
                return
            else:
                raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), dname + fname)
    except BaseException:
        raise

def handle_bootimg(filename):
    global KEEPSTUFF
    name = getBasename(filename)
    if (name[:4] in ['boot', 'hosd', 'BOOT']
            or name[:8] in ['recovery', 'fastboot', 'RECOVERY']
            or name[:9] == 'droidboot'
            or name[:10] == 'okrecovery'
            or name[-4:] == '.bin'):
        subprocess.run([IMGTOOL, filename, 'extract'])
        os.chdir('extracted')
        format_ = getFormat('ramdisk')
        if (format_ == 'LZ4'):
            subprocess.run(['unlz4', 'ramdisk', 'ramdisk.out'], shell=True)
            subprocess.run(['cat', 'ramdisk.out', '|', 'cpio', '-i'], shell=True)
            os.remove('ramdisk.out')
        elif (format_ == 'gzip'):
            cpio(gunzip('ramdisk', '-c'), '-i')
            rm('ramdisk')
        os.chdir('..')
        find_output = find('extracted', '-print0').stdout.decode('utf-8').splitlines()
        for line in find_output:
            if (os.path.isfile(line)):
                format_ = getFormat('ramdisk')
                if (format_ == 'gzip'):
                    mv(line, line, '.gz')
                    gunzip('-f', line, '.gz')
                    result = at_extract(line)
                else:
                    result = at_extract(line)
                print(line + " processed: " + result)
        if (KEEPSTUFF == 1):
            cp('-r', 'extracted', MY_FULL_DIR + '/' + SUB_DIR + '/' + name)
            chown('-R', EXTUSER + ':' + EXTGROUP, MY_FULL_DIR + '/' + SUB_DIR + '/' + name)
        shutil.rmtree("extracted")
    else:
        handle_binary(filename)

def download_ftp_data(address, username, password, files):
    """ """
    print('connecting to: ', address, '...')
    ftp = ftplib.FTP(address)
    print('logging in...')
    ftp.login(username, password)
    for file in files:
        os.makedirs(os.path.dirname(file[1]), exist_ok=True)
        if ask_me_every_time:
            user_input = input(ftp_prompt.format(file[0]))
            if user_input.lower() != 'y':
                print(ftp_skipping_prompt.format(file[0]))
                continue
        print('downloading: ', file[0], '...')
        ftp.sendcmd("TYPE i")
        size = ftp.size(file[0])
        p_bar = progressbar.AnimatedProgressBar(end=size, width=10)
        with open(file[1] + '.gz', 'wb') as f:
            def callback(chunk):
                f.write(chunk)
                p_bar + len(chunk)
                p_bar.show_progress()
            ftp.retrbinary("RETR " + file[0], callback)
        p_bar + size
        p_bar.show_progress()
        print()
        print('extracting...')
        gunzip(file[1] + '.gz', '-f')
        # add \ to \t because backward compatibility is important
        with open(file[1], 'r') as f:
            content = f.read()
        content = content.replace('\t', '\\t')
        with open(file[1], 'w') as f:
            f.write(content)
    print('done')

def write_pair_file(debug_path, pair, run_content, run_filename, run_dir):
    if run_filename.endswith('gz'):
        local_file_name = f'R{pair}.fastq.gz'
    else:
        local_file_name = f'R{pair}.fastq'
    open_operator = open
    with open(debug_path, 'a') as f:
        f.write(f'{run_filename} ({local_file_name}) is being handled with {open_operator}\n')
    local_file_path = os.path.join(run_dir, local_file_name)
    with open_operator(local_file_path, 'wb') as f:
        f.write(run_content)
    # avoid double zipping:
    if local_file_path.endswith('.gz'):
        try:
            sh.gunzip(local_file_path)
        except:
            shutil.move(local_file_path, local_file_path[:-3])
            pass
    with open(debug_path, 'a') as f:
        f.write(f'R{pair} was handled successfully\n')

def ensembl_data_url(gen_ver, species, dir):
    # Get species specific url
    gene_url, peptide_url = ensembl_url(species, gen_ver)
    # Make file if it doesn't exist
    mkfile(f'{dir}/{species}/ensembl')
    # DL datasets
    gene_fn = wget.download(gene_url, f'{dir}/{species}/ensembl')
    peptide_fn = wget.download(peptide_url, f'{dir}/{species}/ensembl')
    # unzip
    gz_f = lambda x: gunzip(x) if '.gz' in x else None
    gz_f(gene_fn), gz_f(peptide_fn)
    files = os.listdir(f'{dir}/{species}/ensembl/')
    # remove leftover .gz files
    [os.remove(os.path.join(f'{dir}/{species}/ensembl/', file)) for file in files if file.endswith(".gz")]
    return gene_fn.replace('.gz', ''), peptide_fn.replace('.gz', '')

from sh import gunzip
from glob import glob
import re, os, json

OUTDIR = "data-unzipped"
if not os.path.exists(OUTDIR):
    print "created dir ", OUTDIR
    os.makedirs(OUTDIR)

for dataFile in glob("data/*.gz"):
    outFile = os.path.join(OUTDIR, re.search('(\d+).gz$', dataFile).groups()[0])
    if not os.path.exists(outFile):
        gunzip('-c', dataFile, _out=outFile)
        print "unzipped", dataFile, "to", outFile

# then, go through the data files and create a new JSON file that maps
# mpId [of route] -> { latitude, longitude, grade, protection }
allRoutes = dict()
allAreas = dict()
for dataFile in glob("data-unzipped/*"):
    data = json.load(file(dataFile))
    allRoutes.update(dict([(route['mpId'], route) for route in data['routes']]))
    # parentId of a route gives its area. parent of an area is the parent area.
    areas = dict([(area['id'], area) for area in data['areas']])
    allAreas.update(areas)

# stupid algorithm to fill in missing latitude/longitude with values for

def un_gzip(self, content):
    return sh.gunzip(_in=content).stdout