def test_remote_download_urls_extract(self):
    executor.delete_dir(REMOTE_TARGET, TEST_DATA.creds)
    wget.download(REMOTE_TARGET, self.urls, creds=TEST_DATA.creds, extract=True)
    act_hash = hasher.get_md5(REMOTE_TARGET, TEST_DATA.creds)
    assert act_hash == TEST_DATA.download_md5_extracted, \
        "remote download failed. Expected: {}, Actual: {}".format(
            TEST_DATA.download_md5_extracted, act_hash)
def test_local_download_urls_extract(self):
    shutil.rmtree(LOCAL_TARGET)
    wget.download(LOCAL_TARGET, TEST_DATA.zip_url,
                  filenames='x.zip', extract=True)
    assert len(glob('{}/*.xml'.format(LOCAL_TARGET))) > 0, \
        'Failed to download/extract the url.'
    shutil.rmtree(LOCAL_TARGET)
def test_local_download_urls(self):
    executor.delete_dir(REMOTE_TARGET, TEST_DATA.creds)
    wget.download(LOCAL_TARGET, self.urls)
    act_hash = hasher.get_md5(LOCAL_TARGET)
    assert act_hash == TEST_DATA.download_md5, \
        "local download failed. Expected: {}, Actual: {}".format(
            TEST_DATA.download_md5, act_hash)
def test_remote_download_urls(self):
    executor.delete_dir(REMOTE_TARGET, TEST_DATA.creds)
    wget.download(REMOTE_TARGET, self.urls, creds=TEST_DATA.creds)
    act_hash = hasher.get_md5(REMOTE_TARGET, TEST_DATA.creds)
    assert act_hash == TEST_DATA.download_md5, \
        'remote download failed. Expected: {}, Actual: {}'.format(
            TEST_DATA.download_md5, act_hash)
def test_download_with_key(self):
    executor.delete_dir(REMOTE_TARGET, TEST_DATA.creds)
    wget.download(REMOTE_TARGET, self.urls, creds=TEST_DATA.creds)
    rsync.download(REMOTE_TARGET, LOCAL_TARGET, creds=TEST_DATA.creds)
    act_hash = hasher.get_md5(REMOTE_TARGET, TEST_DATA.creds)
    assert act_hash == TEST_DATA.download_md5, \
        "download failed. Expected: {}, Actual: {}".format(
            TEST_DATA.download_md5, act_hash)
def main(lon, lat):
    '''First argument: degrees longitude (E positive, W negative) of the
    landslide location; second argument: latitude (N positive, S negative),
    in decimal degrees (not minutes etc.).'''
    nlat, wlon, slat, elon = ten_km_square(lat, lon)
    #print("(NLat:{:.4f},WLon:{:.4f}),(SLat:{:.4f},ELon:{:.4f});".format(nlat, wlon, slat, elon))
    region = (
        '[{:.4f},{:.4f}], [{:.4f},{:.4f}], [{:.4f},{:.4f}], [{:.4f},{:.4f}]'.format(
            wlon, nlat, elon, nlat, wlon, slat, elon, slat))
    rectangle = '[{:.4f},{:.4f},{:.4f},{:.4f}]'.format(wlon, slat, elon, nlat)
    lister = region.split('/')
    finallist = [ast.literal_eval(i) for i in lister]  # parse the string into Python lists
    final = [list(value) for value in finallist]  # final list of corner coordinates
    print(final)

    import ee
    import ee.mapclient
    ee.Initialize()

    start = ee.Date('2017-01-01')
    finish = ee.Date('2017-03-20')
    # pass the final value as the polygon boundary of the 10 km square area
    rectangle = ee.Geometry.Polygon(final)
    collection = ee.ImageCollection('COPERNICUS/S2')
    filteredCollection = collection.filterBounds(rectangle).filterDate(start, finish)
    first = filteredCollection.first()
    path = first.getDownloadURL({'region': final, 'scale': 10})
    print(path)
    urls = [path]
    # path to your location to save a particular image
    wget.download('/home/path/to/save', urls[0], filenames='x.zip', extract=True)
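# The ten_km_square() helper called above is not part of this snippet. Below is a
# minimal sketch of what such a helper might look like, assuming a simple
# equirectangular approximation (~111.32 km per degree of latitude); the name and
# return order are taken from the call above, the body is an assumption.
import math

def ten_km_square(lat, lon, half_side_km=5.0):
    """Return (nlat, wlon, slat, elon) for a square roughly 10 km on a side,
    centred on (lat, lon)."""
    km_per_deg_lat = 111.32                                # approximate
    km_per_deg_lon = 111.32 * math.cos(math.radians(lat))  # shrinks toward the poles
    dlat = half_side_km / km_per_deg_lat
    dlon = half_side_km / km_per_deg_lon
    return lat + dlat, lon - dlon, lat - dlat, lon + dlon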
def get_pdfs(listofurls):
    if len(listofurls) < 1:
        return False
    os.mkdir('/tmp/havenpdfs', 0755)
    numberdownload = 0
    try:
        items = wget.download('/tmp/havenpdfs', listofurls, extract=False)
        numberdownload = numberdownload + 1
    except:
        print numberdownload
        print "Problem..."
    return numberdownload
if not os.path.exists(glove_dir):
    os.makedirs(glove_dir)
with open(osp.join(glove_dir, name), "wb") as code:
    code.write(f.read())


if __name__ == "__main__":
    vector_size = [50, 150, 300]
    vector_names_glove = [
        "glove_wiki_50.txt", "glove_wiki_150.txt", "glove_wiki_300.txt"
    ]
    vector_names_word2vec_50 = ["w2v_wiki_50.txt"]
    print "Starting download"
    wget.download(glove_dir, urls=glove_urls,
                  filenames=vector_names_glove, parallelism=3)
    # print "Downloaded", vector_names_glove
    # wget.download(word2vec_dir, urls=w2v_50, filenames=w2v_names_50, parallelism=3)
    # print "Downloaded", w2v_names_50
    # wget.download(word2vec_dir, urls=w2v_150, filenames=w2v_names_150, parallelism=3)
    # print "Downloaded", w2v_names_150
    # wget.download(word2vec_dir, urls=w2v_300, filenames=w2v_names_300, parallelism=3)
    # print "Downloaded", w2v_names_300
    # wget.download(glove_dir, urls=[glove_urls[0]], filenames=[vector_names[0]], parallelism=3)
    # wget.download(glove_dir, urls=[glove_urls[1]], filenames=[vector_names[1]], parallelism=3)
    # wget.download(glove_dir, urls=[glove_urls[2]], filenames=[vector_names[2]], parallelism=3)
    # This takes some time. The files are quite big.
mesDrugs = mesResults['drug']
mesEvents = mesDrugs['event']
mesPartitions = mesEvents['partitions']

os.chdir('data')
for partition in mesPartitions:
    url = partition['file']
    print(url)
    file_name = url.split('/')[-1]
    dir_name = url.split('/')[-2]
    if not os.path.isfile(os.path.join(dir_name, file_name[0:-4])):
        if not os.path.isdir(dir_name):
            os.makedirs(dir_name)
        os.chdir(dir_name)
        monfichier = wget.download(os.getcwd(), url, filenames=file_name)
        monzip = zipfile.ZipFile(file_name, 'r')
        monzip.extractall()
        os.remove(file_name)
        os.chdir('..')
os.chdir('..')

# I save the fields needed to answer question 1 from the document provided.
# I planned to make use of safetyreportid, safetyreportversion and companynumb,
# but I did not have time in the end; this needs to be done in a more finalised version.
# There are several reaction (reactionmeddrapt) fields per record, and we create
# one separate line for each of them in the output file (a sketch of this step
# follows after this snippet).
all_files = []
os.chdir('data')
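# A minimal sketch (not the original author's code) of the "one output line per
# reaction" step described in the comments above, assuming each record is a dict
# shaped like an openFDA drug-event report; the record layout and field access
# here are assumptions for illustration only.
def explode_reactions(record):
    """Yield one flat row per reaction (reactionmeddrapt) in a record."""
    base = {
        'safetyreportid': record.get('safetyreportid'),
        'safetyreportversion': record.get('safetyreportversion'),
        'companynumb': record.get('companynumb'),
    }
    for reaction in record.get('patient', {}).get('reaction', []):
        row = dict(base)
        row['reactionmeddrapt'] = reaction.get('reactionmeddrapt')
        yield row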
def main3(imgs):
    wget.download('.', imgs)
import urllib
import re
from parallel_sync import wget
from os.path import abspath, dirname, join

#
# parse html
#
link = "http://ronnywang-newsdiff.s3-website-ap-northeast-1.amazonaws.com/2016/"
fp = urllib.urlopen(link)
myHTML = fp.read()
matches = re.findall('<a href="201602([^"]+)">', myHTML, re.DOTALL)
#print(matches)

#
# download
#
prefix = 'http://ronnywang-newsdiff.s3-website-ap-northeast-1.amazonaws.com/2016/201602'
downList = [prefix + x for x in matches]
#print(downList)
targetPath = join(dirname(abspath(__file__)), 'extract')
wget.download(targetPath, downList, extract=True)
for line in status:
    orders = int(line)
status.close()

if data['lender'] > orders:
    status = open('status.txt', 'w')
    status.write(str(data['lender']))
    status.close()
    media_url = data['path'].split('/')
    path = ''
    for i in range(media_url.index('media'), len(media_url)):
        path = os.path.join(path, media_url[i])
    path_details = os.path.join(path, 'details.txt')
    mk_dir = os.path.join('/home/flytech/Documents/Projekty/WebMediaRoot', data['name'])
    if not os.path.exists(mk_dir):
        os.mkdir(mk_dir)
    download_target = [os.path.join('http://127.0.0.1:8000/', path_details)]
    wget.download(mk_dir, download_target)
    details = open(os.path.join(mk_dir, 'details.txt'), 'r')
    photos = []
    path_photo = os.path.join('http://127.0.0.1:8000/', path)
    for line in details:
        photos.append(os.path.join(path_photo, line.split(', ')[0]))
    wget.download(mk_dir, photos)
time.sleep(1)
def parse_items(self, response):
    # Create an item
    item = NetCDFScraperItem()
    # Populate it
    item["url"] = response.url
    item["date"] = str(datetime.datetime.utcnow())
    item["status"] = "UNKWN"
    if response.url.endswith(".nc4") or response.url.endswith(
            ".nc") or response.url.endswith("dods"):
        stored_item = None
        # Try to check if this url has been visited using mongodb
        client = MongoClient(self.mongodb_url)
        user = self.configSectionMap("mongodb")['user']
        password = self.configSectionMap("mongodb")['password']
        print "user:", user, "password:", password
        if client.netcdf.authenticate(user, password) is True:
            db = client.netcdf
            items = db.items
            stored_item = items.find_one({"url": response.url})
        else:
            print "No MongoDb authentication"
        client.close()
        if stored_item is not None:
            # Now a very trivial behaviour
            # It should be smarter
            item["status"] = "ASTRD"
            return item
        downloaded = False
        tempdir = ""
        filename = ""
        # Check if it is served by an opendap server
        try:
            # The netcdf file is hosted by an opendap server
            # There is no need to download it
            rootgrp = Dataset(response.url)
            rootgrp.close()
            filename = response.url
        except:
            # Unfortunately the netcdf has to be downloaded
            tempdir = tempfile._get_default_tempdir() + "/" + next(
                tempfile._get_candidate_names())
            path = urlparse.urlsplit(response.url).path
            filename = tempdir + "/" + posixpath.basename(path)
            wget.download(tempdir, response.url)
            downloaded = True
        # Get the feature
        netCDF2JSON = NetCDF2JSON()
        feature = netCDF2JSON.get(filename, response.url)
        # Remove the downloaded file and directory if needed
        if downloaded is True:
            os.remove(filename)
            os.rmdir(tempdir)
        # Check if the feature is valid
        if feature is not None:
            #print json.dumps(feature, None, "\t")
            # Try to save the item in mongodb
            client = MongoClient(self.mongodb_url)
            if client.netcdf.authenticate(user, password) is True:
                db = client.netcdf
                items = db.items
                item_id = items.insert(feature)
                item["status"] = "NEWFT"
            else:
                print "No MongoDb authentication"
            client.close()
    # Return the item
    return item
def download_urls():
    """ downloads some files """
    urls = ['http://www.nationalgeographic.com/dc/exposure/homepage/photoconfiguration/image/70759_photo_nxqzsecnr7nwui2pbv33cboxp3vu2hmpyjyavf6lo6pvvsfavj3q_850x478.jpg',
            'http://www.nationalgeographic.com/dc/exposure/homepage/photoconfiguration/image/70867_photo_g2j2wmgshw2nhigkyrhbstxkylvu2hmpyjyavf6lo6pvvsfavj3q_850x478.jpg']
    wget.download('/tmp/images', urls, creds=env)
bucket = 'colab-pea'
filename = './10MB'
path = 'test.zip'
#s3.upload_file(filename, bucket, path)

urls = [
    'https://newsmaze.net/storage/new_proof_disk/Pea/Director1.7z',
    'https://newsmaze.net/storage/new_proof_disk/Pea/Director2.7z',
    'https://newsmaze.net/storage/new_proof_disk/Pea/Director3.7z',
    'https://newsmaze.net/storage/new_proof_disk/Pea/Director4.7z',
    'https://newsmaze.net/storage/new_proof_disk/Pea/Director5.7z',
    'https://newsmaze.net/storage/new_proof_disk/Pea/ELGBlockchanin.7z',
    'https://newsmaze.net/storage/new_proof_disk/Pea/EnergyProject.7z'
]

# wget.download('/tmp', urls)
# or a single file:
wget.download(os.path.join(os.getcwd(), 'tmp'), urls, tries=10)

for url in urls:
    try:
        filename = re.search(r'\w*\.\w*$', url).group(0)
        path = os.path.join(os.getcwd(), 'tmp', filename)
        #path = f'./tmp/{filename}'
        #wget.download(os.path.join(os.getcwd(), 'tmp'), url, filenames=filename)
        print(f'uploading {filename} to s3')
        s3.upload_file(path, bucket, filename)
        #os.remove(path)
    except:
        print(f'error downloading file {url} error: {sys.exc_info()}')
from parallel_sync import wget
import os

target_path = os.getcwd()
url = "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2"
wget.download(target_path, urls=url)