def test_parallel_local_download(self):
    """Split matched local files over two download threads and check
    that every file ends up in the data directory.

    The first matched file is given to one thread, the remaining files
    to a second one; both threads must download their full list.
    """
    locald = LocalDownload(self.examples)
    (file_list, dir_list) = locald.list()
    locald.match([r'^test'], file_list, dir_list)
    # First matched file for thread 1, the rest for thread 2
    list1 = [locald.files_to_download[0]]
    list2 = locald.files_to_download[1:]
    locald.close()
    locald1 = LocalDownload(self.examples)
    locald1.files_to_download = list1
    locald2 = LocalDownload(self.examples)
    locald2.files_to_download = list2
    t1 = DownloadThread(locald1, self.utils.data_dir)
    t2 = DownloadThread(locald2, self.utils.data_dir)
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    # assertEqual gives a readable "2 != 1" style failure message,
    # unlike assertTrue(len(...) == n) which only reports False
    self.assertEqual(len(t1.downloader.files_to_download), 1)
    self.assertTrue(
        os.path.exists(self.utils.data_dir + '/' + list1[0]['name']))
    self.assertEqual(len(t2.downloader.files_to_download), 2)
    self.assertTrue(
        os.path.exists(self.utils.data_dir + '/' + list2[0]['name']))
    self.assertTrue(
        os.path.exists(self.utils.data_dir + '/' + list2[1]['name']))
def test_parallel_local_download(self):
    """Download matched local files with two concurrent DownloadThread
    workers and verify all files land in the data directory."""
    scanner = LocalDownload(self.examples)
    (files, dirs) = scanner.list()
    scanner.match([r'^test'], files, dirs)
    # One file for the first worker, the remainder for the second
    head = [scanner.files_to_download[0]]
    tail = scanner.files_to_download[1:]
    scanner.close()
    dl_head = LocalDownload(self.examples)
    dl_head.files_to_download = head
    dl_tail = LocalDownload(self.examples)
    dl_tail.files_to_download = tail
    worker_head = DownloadThread(dl_head, self.utils.data_dir)
    worker_tail = DownloadThread(dl_tail, self.utils.data_dir)
    worker_head.start()
    worker_tail.start()
    worker_head.join()
    worker_tail.join()
    self.assertTrue(len(worker_head.downloader.files_to_download) == 1)
    self.assertTrue(
        os.path.exists(self.utils.data_dir + '/' + head[0]['name']))
    self.assertTrue(len(worker_tail.downloader.files_to_download) == 2)
    for entry in tail:
        self.assertTrue(
            os.path.exists(self.utils.data_dir + '/' + entry['name']))
def wf_download(self):
    """
    Download remote files or use an available local copy from last
    production directory if possible.

    Builds one downloader (or several sub-downloaders for the ``multi``
    protocol) from the bank configuration, matches remote files against
    the configured patterns, determines the release if not already set,
    skips files already present in the offline directory or copyable
    from the last production directory, then downloads the remaining
    files with a pool of threads.

    :return: False on unsupported protocol or download error, True on
             success (or the result of ``no_need_to_update()`` when
             nothing needs downloading).
    """
    logging.info("Workflow:wf_download")
    flow = self.get_flow(Workflow.FLOW_DOWNLOAD)
    downloader = None
    cf = self.session.config
    self.session.previous_release = self.session.get("previous_release")

    if cf.get("protocol") == "multi":
        # protocol = multi: every remote file declares its own protocol,
        # server and path, e.g.:
        #   remote.file.0.protocol = directftp
        #   remote.file.0.server = ftp.ncbi.org
        #   remote.file.0.path = /musmusculus/chr1/chr1.fa
        #
        #   => http://ftp2.fr.debian.org/debian/README.html?key1=value&key2=value2
        #   remote.file.1.protocol = directhttp
        #   remote.file.1.server = ftp2.fr.debian.org
        #   remote.file.1.path = debian/README.html
        #   remote.file.1.method = GET
        #   remote.file.1.params.keys = key1,key2
        #   remote.file.1.params.key1 = value1
        #   remote.file.1.params.key2 = value2
        #
        #   => http://ftp2.fr.debian.org/debian/README.html with POST
        #   params key1=value1, key2=value2: same keys with
        #   remote.file.1.method = POST
        downloader = MultiDownload()
        downloaders = []
        # Create one sub-downloader per remote.file.N entry
        i = 0
        rfile = cf.get("remote.file." + str(i) + ".path")
        while rfile is not None:
            # Hoist the per-entry key prefix out of the repeated lookups
            prefix = "remote.file." + str(i) + "."
            protocol = cf.get(prefix + "protocol")
            if protocol is None:
                protocol = cf.get("protocol")
            server = cf.get(prefix + "server")
            if server is None:
                server = cf.get("server")
            subdownloader = self.get_handler(protocol, server, "", [cf.get(prefix + "path")])
            credentials = cf.get(prefix + "credentials")
            if credentials is None:
                credentials = cf.get("server.credentials")
            if credentials is not None:
                subdownloader.set_credentials(credentials)
            if protocol == "directhttp":
                subdownloader.method = cf.get(prefix + "method")
                if subdownloader.method is None:
                    subdownloader.method = "GET"
                if cf.get(prefix + "name"):
                    subdownloader.save_as = cf.get(prefix + "name")
                else:
                    subdownloader.save_as = cf.get(prefix + "path")
                if cf.get(prefix + "method"):
                    subdownloader.method = cf.get(prefix + "method").strip().upper()
                # BUG FIX: the dict was initialized as 'params' but then
                # filled through 'param'; use one attribute consistently.
                # 'param' matches the attribute written in the
                # single-downloader directhttp case below.
                subdownloader.param = {}
                keys = cf.get(prefix + "params.keys")
                if keys is not None:
                    for key in keys.split(","):
                        param = cf.get(prefix + "params." + key.strip())
                        subdownloader.param[key.strip()] = param.strip()
            downloaders.append(subdownloader)
            i += 1
            rfile = cf.get("remote.file." + str(i) + ".path")
        downloader.add_downloaders(downloaders)
    else:
        # Simple case: one downloader, optionally with regexps
        protocol = cf.get("protocol")
        if protocol == "directhttp" or protocol == "directftp":
            downloader = self.get_handler(cf.get("protocol"), cf.get("server"), "/", [cf.get("remote.dir")[:-1]])
            downloader.method = cf.get("url.method")
            if downloader.method is None:
                downloader.method = "GET"
            downloader.save_as = cf.get("target.name")
            keys = cf.get("url.params")
            if keys is not None:
                for key in keys.split(","):
                    param = cf.get(key.strip() + ".value")
                    downloader.param[key.strip()] = param.strip()
        else:
            downloader = self.get_handler(cf.get("protocol"), cf.get("server"), cf.get("remote.dir"))

    if downloader is None:
        logging.error("Protocol " + cf.get("protocol") + " not supported")
        return False

    (file_list, dir_list) = downloader.list()
    downloader.match(cf.get("remote.files").split(), file_list, dir_list)
    # Compute save_as for each file: when a pattern captures groups, the
    # saved path is rebuilt from the captured groups only
    for f in downloader.files_to_download:
        f["save_as"] = f["name"]
        for p in cf.get("remote.files").split():
            res = re.match("/" + p, f["name"])
            if res is not None and len(res.groups()) >= 1:
                f["save_as"] = "/".join(res.groups())
                break
    self.session.set("download_files", downloader.files_to_download)

    if self.session.get("release") is None:
        # Not defined, or could not get it earlier:
        # set release to the most recent file to download
        release_dict = Utils.get_more_recent_file(downloader.files_to_download)
        if release_dict is None:
            today = datetime.datetime.now()
            release_dict = {"year": today.year, "month": today.month, "day": today.day}
        release = str(release_dict["year"]) + "-" + str(release_dict["month"]) + "-" + str(release_dict["day"])
        self.session.set("release", release)
        self.session.set("remoterelease", release)
        # We restart from scratch: check if a directory for this release
        # already exists (XX or XX__Y) and pick the next free suffix
        if self.options.get_option(Options.FROMSCRATCH):
            index = 0
            if os.path.exists(self.session.get_full_release_directory()):
                index = 1
            for x in range(1, 100):
                if os.path.exists(self.session.get_full_release_directory() + "__" + str(x)):
                    index = x + 1
            if index > 0:
                self.session.set("release", release + "__" + str(index))
                release = release + "__" + str(index)
        logging.info("Workflow:wf_download:release:remoterelease:" + self.session.get("remoterelease"))
        logging.info("Workflow:wf_download:release:release:" + release)
        MongoConnector.banks.update(
            {"name": self.bank.name},
            {"$set": {"status.release.progress": str(release)}})

    self.download_go_ahead = False
    if self.options.get_option(Options.FROM_TASK) == "download":
        # We want to download again in same release, that's fine, we do
        # not care it is the same release
        self.download_go_ahead = True
    if not self.download_go_ahead and self.session.previous_release == self.session.get("remoterelease"):
        logging.info("Workflow:wf_release:same_as_previous_session")
        return self.no_need_to_update()

    self.banks = MongoConnector.banks
    self.bank.bank = self.banks.find_one({"name": self.name})
    nb_prod_dir = len(self.bank.bank["production"])
    offline_dir = self.session.get_offline_directory()
    copied_files = []

    # Check offline dir: keep for download only files that are absent
    # or differ from the local copy in size or date
    keep_files = []
    if os.path.exists(offline_dir):
        for file_to_download in downloader.files_to_download:
            offline_file = offline_dir + "/" + file_to_download["name"]
            if os.path.exists(offline_file):
                try:
                    file_stat = os.stat(offline_file)
                    f_stat = datetime.datetime.fromtimestamp(os.path.getmtime(offline_file))
                    same_size = str(file_stat.st_size) == str(file_to_download["size"])
                    same_date = (str(f_stat.year) == str(file_to_download["year"])
                                 and str(f_stat.month) == str(file_to_download["month"])
                                 and str(f_stat.day) == str(file_to_download["day"]))
                    if not same_size or not same_date:
                        logging.debug("Workflow:wf_download:different_from_offline:" + file_to_download["name"])
                        keep_files.append(file_to_download)
                    else:
                        logging.debug("Workflow:wf_download:offline:" + file_to_download["name"])
                except Exception:
                    # Could not get stats on file: drop the local copy
                    # and download it again
                    os.remove(offline_file)
                    keep_files.append(file_to_download)
            else:
                keep_files.append(file_to_download)
        downloader.files_to_download = keep_files

    self.download_go_ahead = False
    if self.options.get_option(Options.FROM_TASK) == "download":
        # Explicit download task: re-download even in the same release
        self.download_go_ahead = True

    if not self.options.get_option(Options.FROMSCRATCH) and not self.download_go_ahead and nb_prod_dir > 0:
        # Get last production directory and its recorded session
        last_production = self.bank.bank["production"][nb_prod_dir - 1]
        last_production_session = self.banks.find_one(
            {"name": self.name, "sessions.id": last_production["session"]},
            {"sessions.$": 1})
        last_production_dir = os.path.join(
            last_production["data_dir"], cf.get("dir.version"), last_production["release"])
        # Check if some files can be copied instead of downloaded
        downloader.download_or_copy(last_production_session["sessions"][0]["files"], last_production_dir)
        if len(downloader.files_to_download) == 0:
            return self.no_need_to_update()
        logging.debug("Workflow:wf_download:Copy files from " + last_production_dir)
        copied_files = downloader.files_to_copy
        Utils.copy_files(downloader.files_to_copy, offline_dir)
    downloader.close()

    DownloadThread.NB_THREAD = int(self.session.config.get("files.num.threads"))
    if cf.get("protocol") == "multi":
        thlist = DownloadThread.get_threads_multi(downloader.downloaders, offline_dir)
    else:
        thlist = DownloadThread.get_threads(downloader, offline_dir)

    running_th = []
    for th in thlist:
        running_th.append(th)
        th.start()
    while len(running_th) > 0:
        try:
            # Join with a short timeout so Ctrl-C stays responsive, then
            # drop the threads that have finished.
            # BUG FIX: the previous list comprehension stored join()'s
            # return value (always None) instead of the thread objects,
            # so both the liveness filter and the Ctrl-C handler below
            # operated on None entries.
            for t in running_th:
                t.join(1)
            # is_alive(): isAlive() was removed in Python 3.9
            running_th = [t for t in running_th if t.is_alive()]
            logging.debug("Workflow:wf_download:Download:Threads:" + str(running_th))
        except KeyboardInterrupt:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("Ctrl-c received! Sending kill to threads...")
            logging.warning("Running tasks will continue and process will stop.")
            for t in running_th:
                t.downloader.kill_received = True
    logging.info("Workflow:wf_download:Download:Threads:Over")

    # Propagate any thread failure to the downloader
    for th in thlist:
        if th.error:
            downloader.error = True
            break
    self.downloaded_files = downloader.files_to_download + copied_files
    if downloader.error:
        logging.error("An error occurred during download")
        return False
    return True