def single_station_rinex_garner_download(save_dir, minimum_year=None, station='tela'):
    import htmllistparse
    import requests
    import os
    import logging
    logger = logging.getLogger('rinex_garner')
    savepath = save_dir
    if not os.path.exists(savepath):
        try:
            os.makedirs(savepath)
            logger.info('Creating {} for station {}'.format(savepath, station))
        except OSError:
            logger.error("Creation of the directory %s failed" % savepath)
        else:
            logger.info("Successfully created the directory %s" % savepath)
    else:
        logger.warning('Folder {} already exists.'.format(savepath))
    command = 'http://*****:*****@garner.ucsd.edu/pub/rinex/'
    cwd, listing = htmllistparse.fetch_listing(command, timeout=30)
    dirs = [f.name for f in listing if '/' in f.name]
    if minimum_year is not None:
        years = [int(x.split('/')[0]) for x in dirs]
        years = [x for x in years if x >= minimum_year]
        dirs = [str(x) + '/' for x in years]
        logger.info('starting search from year {}'.format(minimum_year))
    for year in dirs:
        logger.info(year)
        cwd, listing = htmllistparse.fetch_listing(command + year, timeout=30)
        days = [f.name for f in listing if '/' in f.name]
        for day in days:
            cwd, listing = htmllistparse.fetch_listing(command + year + day, timeout=30)
            files = [f.name for f in listing if f.size is not None]
            found = [f for f in files if station in f]
            if found:
                filename = found[0]
                saved_filename = savepath / filename
                if saved_filename.is_file():
                    logger.warning('{} already exists in {}, skipping...'.format(filename, savepath))
                    continue
                logger.info('Downloading {} to {}.'.format(filename, savepath))
                r = requests.get(command + year + day + filename)
                with open(saved_filename, 'wb') as file:
                    file.write(r.content)
    logger.info('Done downloading station {}.'.format(station))
    return
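
# Usage sketch for the downloader above, assuming it is importable from a module
# (the module name `rinex_utils` is hypothetical; adjust the import to wherever the
# function actually lives). Note that save_dir must be a pathlib.Path, since the
# function joins paths with the / operator and calls .is_file() on the result.
import logging
from pathlib import Path

from rinex_utils import single_station_rinex_garner_download  # hypothetical module

logging.basicConfig(level=logging.INFO)
single_station_rinex_garner_download(Path('/tmp/rinex'), minimum_year=2019, station='tela')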
def fetch_image_builds(version):
    """ Fetch ceph container image builds.

        1) Search the share path for ceph images under DEFAULT_OSBS_SERVER.
        2) Look for json files matching the particular RHCS version.
        3) Sort builds based on timestamp and return them.

        todo: Fix when upgrade scenario needs image from source path
    """
    try:
        cwd, c_list = fetch_listing(DEFAULT_OSBS_SERVER, timeout=60)
        assert c_list, "Container file(s) not found"

        # listing entries are FileEntry objects, so filter on the name attribute
        c_list = [i for i in c_list if i.name.endswith("json")]

        builds = dict()
        for comp in c_list:
            if version in comp.name:
                dt = datetime.datetime.fromtimestamp(mktime(comp.modified)).timestamp()
                builds.update({dt: comp})

        builds = [builds[k] for k in sorted(builds)]
        return builds
    except AssertionError as err:
        logging.warning(err)
        raise AssertionError(f"Ceph Image builds not found : {DEFAULT_OSBS_SERVER}")
def tarball_probe_remote_versions(source=None):
    versions = {}

    if source is None:
        return versions

    # Remove everything after the $ (start of variable)
    if '/$' in source:
        source = source[:source.index('$')]
    # Remove the filename
    else:
        for ext in TARBALL_SUPPORTED_EXTENSIONS:
            if source.endswith(ext):
                filename = source.split('/')[-1]
                source = source.replace(filename, '')
                break

    try:
        cwd, listings = htmllistparse.fetch_listing(source, timeout=30)
        for listing in listings:
            if listing.name.endswith(tuple(TARBALL_SUPPORTED_EXTENSIONS)):
                ver = SEMVER_PATTERN.search(listing.name)
                if ver is not None and ver.group(0) not in versions:
                    versions[ver.group(0)] = listing.name
    except Exception as e:
        logger.warning(e)

    print(versions)
    return versions
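
# Usage sketch for tarball_probe_remote_versions, assuming the module-level
# TARBALL_SUPPORTED_EXTENSIONS, SEMVER_PATTERN and logger referenced above are
# defined. The URL and filename pattern below are hypothetical examples.
source = 'https://example.org/releases/mytool-$VERSION.tar.gz'
found_versions = tarball_probe_remote_versions(source)
for version, filename in found_versions.items():
    print(version, '->', filename)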
def query(self, mgrs, sat=None, year=None, version='v1.4', start_date=None, end_date=None):
    if sat is None:
        sat = 'L'
    sat = sat.upper()
    assert sat in 'LS'
    if year is None:
        year = datetime.datetime.now().year
    assert isint(year)
    year = int(year)
    zone = mgrs[:2]
    grid = mgrs[2]
    aa_x, aa_y = tuple(mgrs[3:5])
    url = 'https://hls.gsfc.nasa.gov/data/{version}/{sat}30/{year}/{zone}/{grid}/{aa_x}/{aa_y}/'\
        .format(version=version, sat=sat, year=year, zone=zone, grid=grid, aa_x=aa_x, aa_y=aa_y)
    try:
        cwd, listing = htmllistparse.fetch_listing(url)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            listing = []
            warnings.warn(url + ' returned 404')
        else:
            raise
    listing = [item.name for item in listing if item.name.endswith('hdf')]
    if start_date is not None:
        startdate_m, startdate_d = map(int, start_date.split('-'))
        start_jd = (datetime.date(year, startdate_m, startdate_d) -
                    datetime.date(year, 1, 1)).days + 1
        listing = [
            name for name in listing
            if int(name.split('.')[3][4:]) >= start_jd
        ]
    if end_date is not None:
        enddate_m, enddate_d = map(int, end_date.split('-'))
        enddate_jd = (datetime.date(year, enddate_m, enddate_d) -
                      datetime.date(year, 1, 1)).days + 1
        listing = [
            name for name in listing
            if int(name.split('.')[3][4:]) <= enddate_jd
        ]
    return listing
def file_names(url_or_local, extension):
    is_local = Path(url_or_local).exists() and Path(url_or_local).is_dir()
    is_url = not is_local

    if is_url:
        try:
            cwd, listing = htmllistparse.fetch_listing(url_or_local)
        except requests.exceptions.HTTPError as err:
            status_code = err.response.status_code
            print(status_code)
            return [], []
        else:
            names = []
            for entry in listing:
                name = entry.name
                if name.endswith(extension):
                    names.append(name)
            return cwd, sorted(names)

    # remove current dir from the subdir list
    if is_local:
        cwd = Path(url_or_local)
        cwd_name = cwd.name
        names = []
        files = [f for f in listdir(url_or_local) if isfile(join(url_or_local, f))]
        for f in files:
            name = Path(f).name
            if name.endswith(extension):
                names.append(name)
        return cwd, sorted(names)
def fetch_html_file_list(baseurl, extension):
    cwd, listing = htmllistparse.fetch_listing(baseurl, timeout=10)
    result = [
        baseurl + "/" + item.name
        for item in listing
        if item.name.endswith(extension)
    ]
    return result
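
# Usage sketch for fetch_html_file_list against a hypothetical autoindex-style
# directory listing; it returns full URLs for entries with the given extension.
for tarball_url in fetch_html_file_list("https://example.org/pub/releases", ".tar.gz"):
    print(tarball_url)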
def fetch(self, version=None):
    _cwd, listing = htmllistparse.fetch_listing(self.url, params=self.url_params)
    for e in listing:
        try:
            semver = self.__parse_entry(e.name)
            if version is None or semver > Version(version, partial=True):
                self.versions.add(semver)
        except ValueError:
            print('Ignoring invalid version:', e.name, file=sys.stderr)
def query(self, mgrs, sat='L', year=None, version='v1.4', startdate=None, enddate=None):
    sat = sat.upper()
    assert sat in 'LS'
    if year is None:
        year = datetime.datetime.now().year
    assert isint(year)
    year = int(year)
    zone = mgrs[:2]
    grid = mgrs[2]
    aa_x, aa_y = tuple(mgrs[3:5])
    url = 'https://hls.gsfc.nasa.gov/data/{version}/{sat}30/{year}/{zone}/{grid}/{aa_x}/{aa_y}/'\
        .format(version=version, sat=sat, year=year, zone=zone, grid=grid, aa_x=aa_x, aa_y=aa_y)
    cwd, listing = htmllistparse.fetch_listing(url)
    listing = [item.name for item in listing if item.name.endswith('hdf')]
    if startdate is not None:
        startdate_m, startdate_d = map(int, startdate.split('-'))
        start_jd = (datetime.date(year, startdate_m, startdate_d) -
                    datetime.date(year, 1, 1)).days + 1
        listing = [
            name for name in listing
            if int(name.split('.')[3][4:]) >= start_jd
        ]
    if enddate is not None:
        enddate_m, enddate_d = map(int, enddate.split('-'))
        enddate_jd = (datetime.date(year, enddate_m, enddate_d) -
                      datetime.date(year, 1, 1)).days + 1
        listing = [
            name for name in listing
            if int(name.split('.')[3][4:]) <= enddate_jd
        ]
    return listing
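
# Usage sketch for the query method above. The owning class is not shown here,
# so `HLSCatalog` is a hypothetical name for it; the MGRS tile and dates are
# example values. Granule names are filtered by the day-of-year encoded in the
# fourth dot-separated field of the HLS filename.
catalog = HLSCatalog()
granules = catalog.query('11SPC', sat='S', year=2020, startdate='04-01', enddate='06-30')
for granule in granules:
    print(granule)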
def print_site(uri, show_all):
    _, l = htmllistparse.fetch_listing(uri)
    l = sorted(l, key=lambda i: i.modified, reverse=True)
    for i in l:
        # TODO: figure out what the latest 'stable' release is and print
        # that instead.. probably can use a regex?
        if not i.name.endswith("/"):
            continue
        print(i.name.strip("/"))
        if not show_all:
            break
def scan_url(url, regex, name, recursive):
    time.sleep(1)
    logging.debug(f"scan_url({url}, {regex}, {name}, {recursive})")
    output_array = []
    try:
        cwd, listing = htmllistparse.fetch_listing(url, timeout=15)
        for file_item in listing:
            if not file_item.size and recursive:
                # directories have no size; descend into them and collect their matches
                output_array.extend(scan_url(urljoin(url, file_item.name), regex, name, recursive))
            elif regex.search(file_item.name):
                file_url = urljoin(url, file_item.name)
                output_array.extend(scan_binary(file_url, file_item, name))
    except Exception:
        logging.error(
            f"htmllistparse.fetch_listing({url}, timeout=15) returned an exception"
        )
    return output_array
def preview_remote(api: sly.Api, task_id, context, state, app_logger):
    global listing
    api.task.set_field(task_id, "data.previewError", "")
    try:
        remote_dir = state["remoteDir"]
        parts = urlparse(remote_dir)
        project_name = parts.path.rstrip("/")
        if project_name not in ["", "/"]:
            project_name = sly.fs.get_file_name(project_name)  # last directory name from path
        else:
            project_name = ""

        cwd, raw_listing = htmllistparse.fetch_listing(remote_dir, timeout=30)
        listing = []
        listing_flags = []
        meta_json_exists = False
        for file_entry in raw_listing:
            name = file_entry.name
            # name = slugify(name, lowercase=False, save_order=True)
            if name == 'meta.json':
                meta_json_exists = True
                listing.append({"name": name})
                listing_flags.append({"selected": True, "disabled": True})
            elif name.endswith("/"):
                listing.append({"name": name.rstrip("/")})
                listing_flags.append({"selected": True, "disabled": False})
            else:
                app_logger.info("Skip file {!r}".format(urljoin(remote_dir, name)))
                listing.append({"name": name})
                listing_flags.append({"selected": False, "disabled": True})

        if meta_json_exists is False:
            raise FileNotFoundError("meta.json")

        fields = [
            # {"field": "state.projectName", "payload": slugify(project_name, lowercase=False, save_order=True)},
            {"field": "state.projectName", "payload": project_name},
            {"field": "data.listing", "payload": listing},
            {"field": "state.listingFlags", "payload": listing_flags},
        ]
        api.app.set_fields(task_id, fields)
    except Exception as e:
        api.task.set_field(task_id, "data.previewError", repr(e))
def download_directory(url, storage_path):
    # Given a URL and a local directory, list and download the contents of an entire directory.

    # Change directory to the storage directory.
    os.chdir(storage_path)

    # Get a list of all files in the local directory
    # https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
    _, _, local_filenames = next(os.walk(storage_path))

    # Get remote directory listing via HTTP (uses 3rd party module htmllistparse)
    cwd, listing = htmllistparse.fetch_listing(url, timeout=30)

    # Filter out Python and shell scripts from the listing, using a list comprehension
    listing = [
        i.name for i in listing
        if not ('.py' in i.name or '.sh' in i.name)
    ]
    print(f"Listing: {listing}")

    # Iterate through all served files/directories to determine what is a file and what is a directory
    for thing in listing:
        # Directories are reported with a trailing / in their name
        if '/' in thing:
            directory = thing
            directorypath = f"{storage_path}/{directory}"
            directory_noslash = directory.replace(os.path.sep, '')
            newurl = f"{url}/{directory_noslash}"
            # Check if directory exists locally; if not, create it
            if not os.path.isdir(directorypath):
                os.mkdir(directorypath)
            # Recursively call this function until no more directories are found.
            download_directory(newurl, directorypath)
            # When complete, change back to original directory to reset the recursive "chdir"s
            os.chdir(storage_path)
        else:
            # Else... item is a file to download
            file = thing
            # Check if we've downloaded the file already. If so, skip it!
            if file not in local_filenames:
                download_string = f"{url}/{file}"
                # Call download_file to download the single file chunk by chunk.
                download_file(download_string, storage_path)
            else:
                print(f"Already downloaded {file}")
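
# Usage sketch for download_directory with a hypothetical mirror URL and local
# target directory. It mirrors the remote tree recursively and relies on a
# download_file(url, storage_path) helper that is referenced above but not shown.
download_directory("http://example.org/files", "/data/mirror")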
def startbattle():
    # get results from API
    cwd, listing = htmllistparse.fetch_listing(website, timeout=10)
    battle = random.choice(listing)
    print(battle.name)
    replayfile = str(website) + str(battle.name)
    replaysave = temp_path + str(battle.name)
    # download replay
    try:
        urllib.request.urlretrieve(replayfile, replaysave)
    except urllib.error.URLError as e:
        return
    # run Observer
    os.system("ExampleObserver.exe --Path \"" + replaysave + "\"")
    # delete temp files
    tempfilelist = glob.glob(os.path.join(temp_path, "*.*"))
    for tempfile in tempfilelist:
        os.remove(tempfile)
def glob_data_sets(url):
    """Return list of all data sets behind the given ``url``."""
    result = []
    if url.scheme in data.PYFS_SCHEMES:
        curr_fs = data.make_fs(url)
        for match in curr_fs.glob("*.h5ad"):
            match_path = fs.path.basename(match.path)
            logger.info("Found data set %s at %s" % (match_path, data.redacted_urlunparse(url)))
            result.append(url._replace(path=fs.path.join(url.path, match.path[1:])))
    elif url.scheme == "s3":
        anon = url.username is None and url.password is None
        s3 = s3fs.S3FileSystem(anon=anon, key=url.username, secret=url.password)
        if url.path:
            pattern = "%s/%s/*.h5ad" % (url.hostname, url.path)
        else:
            pattern = "%s/*.h5ad" % (url.hostname,)
        for match in s3.glob(pattern):
            result.append(url._replace(path=match.split("/", 1)[1]))
    elif url.scheme.startswith("http"):
        cwd, listing = htmllistparse.fetch_listing(urlunparse(url), timeout=30)
        for entry in listing:
            if entry.name.endswith(".h5ad"):
                result.append(url._replace(path=fs.path.join(cwd, entry.name)))
    elif url.scheme.startswith("irods"):
        with data.create_irods_session(url) as irods_session:
            # Get pointed-to collection.
            collection = irods_session.collections.get(url.path)
            for data_obj in collection.data_objects:
                if data_obj.name.endswith(".h5ad"):
                    result.append(url._replace(path=fs.path.join(url.path, data_obj.name)))
    else:
        raise ScelVisException("Invalid URL scheme: %s" % url.scheme)
    return result
import htmllistparse as ftp
from epivizfileserver.parser import BigWig
from joblib import Parallel, delayed
import struct
import pandas
import json
import pickle

url = "https://egg2.wustl.edu/roadmap/data/byFileType/signal/consolidated/macs2signal/foldChange/"
cwd, files = ftp.fetch_listing(url)
print("total files - ", len(files))


def get_file_index(file, baseurl):
    print("processing file - ", file.name)
    bw = BigWig(baseurl + file.name)
    print("\t getting zoom headers")
    bw.getZoomHeader()
    print("\t get tree for full data offset")
    tree = bw.getTree(-2)
    bw.getId("chr1")
    ofile = open("objects/" + file.name + ".pickle", 'wb')
    pickle.dump(bw, ofile)
    # ifile = "trees/" + file.name + ".fulltreeindex"
    # print("\t writing index ", ifile)
    # with open(ifile, "wb") as f:
    #     f.write(tree)


# This will download the index from all the files
def all_orbitals_download(save_dir, minimum_year=None, hr_only=None):
    import htmllistparse
    import requests
    import os
    import logging
    logger = logging.getLogger('rinex_garner')
    logger.info('Creating {}/{}'.format(save_dir, 'gipsy_orbitals'))
    savepath = save_dir / 'gipsy_orbitals'
    if not os.path.exists(savepath):
        try:
            os.makedirs(savepath)
        except OSError:
            logger.error("Creation of the directory %s failed" % savepath)
        else:
            logger.info("Successfully created the directory %s" % savepath)
    else:
        logger.warning('Folder {} already exists.'.format(savepath))
    command = 'https://sideshow.jpl.nasa.gov/pub/JPL_GPS_Products/Final/'
    cwd, listing = htmllistparse.fetch_listing(command, timeout=30)
    dirs = [f.name for f in listing if '/' in f.name]
    if minimum_year is not None:
        years = [int(x.split('/')[0]) for x in dirs]
        years = [x for x in years if x >= minimum_year]
        dirs = [str(x) + '/' for x in years]
        logger.info('starting search from year {}'.format(minimum_year))
    for year in dirs:
        logger.info(year)
        cwd, listing = htmllistparse.fetch_listing(command + year, timeout=30)
        files = [f.name for f in listing if f.size is not None]
        # Typical file names in each yearly directory:
        # 2017-01-28.eo.gz
        # 2017-01-28.shad.gz
        # 2017-01-28_hr.tdp.gz
        # 2017-01-28.ant.gz
        # 2017-01-28.tdp.gz
        # 2017-01-28.frame.gz
        # 2017-01-28.pos.gz
        # 2017-01-28.wlpb.gz
        if hr_only is None:
            suffixes = ['eo', 'shad', 'ant', 'tdp', 'frame', 'pos', 'wlpb']
            for suff in suffixes:
                found = [
                    f for f in files
                    if suff in f.split('.')[1] and '_' not in f
                ]
                if found:
                    for filename in found:
                        logger.info('Downloading {} to {}.'.format(filename, savepath))
                        r = requests.get(command + year + filename)
                        with open(savepath / filename, 'wb') as file:
                            file.write(r.content)
        else:
            pre_found = [f for f in files if '_' in f]
            if pre_found:
                found = [
                    f for f in pre_found
                    if f.split('.')[0].split('_')[1] == 'hr'
                ]
                if found:
                    for filename in found:
                        logger.info('Downloading {} to {}.'.format(filename, savepath))
                        r = requests.get(command + year + filename)
                        # binary mode: the response content is gzipped bytes
                        with open(savepath / filename, 'wb') as file:
                            file.write(r.content)
    return
def start_import(api: sly.Api, task_id, context, state, app_logger):
    fields = [
        {"field": "data.destinationError", "payload": ""},
        {"field": "data.uploadError", "payload": ""},
        {"field": "data.uploadStarted", "payload": True},
        {"field": "data.uploadedCount", "payload": 0},
        {"field": "data.totalCount", "payload": 0},
        {"field": "data.uploadProgress", "payload": 0},
        {"field": "data.uploadDsName", "payload": ""},
        {"field": "data.uploadedDsCount", "payload": 0},
        {"field": "data.totalDsCount", "payload": 0},
        {"field": "data.uploadDsProgress", "payload": 0},
    ]
    api.app.set_fields(task_id, fields)

    remote_dir = state["remoteDir"]
    listing_flags = state["listingFlags"]
    workspace_name = state["workspaceName"]
    project_name = state["projectName"]  # slugify(state["projectName"], lowercase=False, save_order=True)
    if project_name == "":
        _show_error(api, task_id, "data.destinationError", "Project name is not defined", app_logger)
        return

    # @TODO: will be added in future releases
    add_to_existing_project = False  # state["addToExisting"]
    existing_meta = None
    try:
        workspace = api.workspace.get_info_by_name(TEAM_ID, workspace_name)
        if workspace is None:
            workspace = api.workspace.create(TEAM_ID, workspace_name)
            app_logger.info("Workspace {!r} is created".format(workspace.name))
        else:
            app_logger.info("Workspace {!r} already exists".format(workspace.name))

        project = api.project.get_info_by_name(workspace.id, project_name)
        if project is None:
            project = api.project.create(workspace.id, project_name)
            app_logger.info("Project {!r} is created".format(project.name))
        else:
            _show_error(api, task_id, "data.destinationError",
                        "Project {!r} already exists".format(project.name), app_logger)
            return
            # unreachable placeholder for the "add to existing project" feature mentioned above
            if add_to_existing_project is False:
                app_logger.warn("Project {!r} already exists. Allow add to existing project or change the name of "
                                "destination project. We recommend to upload to new project. Thus the existing "
                                "project will be safe. New name will be generated".format(project.name))
                project = api.project.create(workspace.id, project_name, change_name_if_conflict=True)
            else:
                existing_meta_json = api.project.get_meta(project.id)
                existing_meta = sly.ProjectMeta.from_json(existing_meta_json)

        update_res_project_icon = None
        fields = [
            {"field": "data.resultProject", "payload": project.name},
            {"field": "data.resultProjectId", "payload": project.id},
            # {"field": "data.resultProjectPreviewUrl", "payload": 0},
        ]
        api.app.set_fields(task_id, fields)

        resp = requests.get(urljoin(remote_dir, 'meta.json'))
        meta_json = resp.json()
        meta = sly.ProjectMeta.from_json(meta_json)
        if existing_meta is not None:
            meta = existing_meta.merge(meta)
        api.project.update_meta(project.id, meta.to_json())

        datasets_to_upload = []
        for ds_info, flags in zip(listing, listing_flags):
            dataset_name = ds_info['name']
            if flags["selected"] is False:
                app_logger.info("Folder {!r} is not selected, it will be skipped".format(dataset_name))
                continue
            if flags["disabled"] is True:
                app_logger.info("File {!r} is skipped".format(dataset_name))
                continue
            datasets_to_upload.append(dataset_name)

        api.task.set_field(task_id, "data.totalDsCount", len(datasets_to_upload))
        for index, dataset_name in enumerate(datasets_to_upload):
            dataset = api.dataset.get_info_by_name(project.id, dataset_name)
            if dataset is None:
                dataset = api.dataset.create(project.id, dataset_name)
                app_logger.info("Dataset {!r} is created".format(dataset.name))
            else:
                app_logger.warn("Dataset {!r} already exists. Uploading is skipped".format(dataset.name))
                _increment_ds_progress(task_id, api, index + 1, len(datasets_to_upload))
                continue

            # img_dir = reduce(urljoin, [remote_dir, dataset_name, 'img'])
            # ann_dir = reduce(urljoin, [remote_dir, dataset_name, 'ann'])
            img_dir = os.path.join(remote_dir, dataset_name, 'img/')
            ann_dir = os.path.join(remote_dir, dataset_name, 'ann/')
            cwd, img_listing = htmllistparse.fetch_listing(img_dir, timeout=30)

            uploaded_to_dataset = 0
            fields = [
                {"field": "data.totalCount", "payload": len(img_listing)},
                {"field": "data.uploadDsName", "payload": dataset.name},
            ]
            api.app.set_fields(task_id, fields)

            task_progress = sly.Progress("Uploading dataset {!r}".format(dataset.name), len(img_listing))
            for batch in sly.batched(img_listing, batch_size=50):
                try:
                    names = []
                    image_urls_batch = []
                    annotations_batch = []
                    for file_entry in batch:
                        name = file_entry.name
                        try:
                            img_url = urljoin(img_dir, name)  # 'https://i.imgur.com/uFYNj9Z.jpg'
                            ann_url = urljoin(ann_dir, name + sly.ANN_EXT)
                            resp = requests.get(ann_url)
                            if resp.status_code == 404:
                                ann_url = urljoin(ann_dir, sly.fs.get_file_name(name) + sly.ANN_EXT)
                                resp = requests.get(ann_url)
                            resp.raise_for_status()
                            ann_json = resp.json()
                            ann = sly.Annotation.from_json(ann_json, meta)
                        except Exception as e:
                            app_logger.warn("Image {!r} and annotation {!r} are skipped due to error: {}"
                                            .format(img_url, ann_url, repr(e)))
                            continue
                        names.append(name)
                        image_urls_batch.append(img_url)
                        annotations_batch.append(ann)

                    img_infos = api.image.upload_links(dataset.id, names, image_urls_batch)
                    uploaded_ids = [img_info.id for img_info in img_infos]
                    api.annotation.upload_anns(uploaded_ids, annotations_batch)
                    uploaded_to_dataset += len(uploaded_ids)
                except Exception as e:
                    app_logger.warn("Batch ({} items) of images is skipped due to error: {}"
                                    .format(len(batch), repr(e)))
                finally:
                    task_progress.iters_done_report(len(batch))
                    _increment_task_progress(task_id, api, task_progress)

                # only once + to check the image urls are loaded correctly
                if update_res_project_icon is None:
                    pinfo = api.project.get_info_by_id(project.id)
                    if pinfo.reference_image_url is None:
                        raise RuntimeError("Preview image is not accessible. Check that image URLs are public.")
                    update_res_project_icon = api.image.preview_url(pinfo.reference_image_url, 100, 100)
                    api.task.set_field(task_id, "data.resultProjectPreviewUrl", update_res_project_icon)

            _increment_ds_progress(task_id, api, index + 1, len(datasets_to_upload))
            app_logger.info("Dataset {!r} is uploaded: {} images with annotations"
                            .format(dataset.name, uploaded_to_dataset))
    except Exception as e:
        app_logger.error(repr(e))
        api.task.set_field(task_id, "data.uploadError", repr(e))

    api.task.set_output_project(task_id, project.id, project.name)
    my_app.stop()