async def _upload(c, m):
    # Refuse to upload until credentials have been configured.
    if not os.path.isfile(Config.CRED_FILE):
        await m.reply_text(text=tr.NOT_AUTHENTICATED_MSG)
        return
    # The command must be sent as a reply to a media message.
    if not m.reply_to_message:
        await m.reply_text(text=tr.NOT_A_REPLY_MSG)
        return
    message = m.reply_to_message
    if not message.media:
        await m.reply_text(text=tr.NOT_A_MEDIA_MSG)
        return
    if not valid_media(message):
        await m.reply_text(text=tr.NOT_A_VALID_MEDIA_MSG)
        return

    # Acknowledge the request with a status message that gets edited later.
    snt = await c.send_message(chat_id=m.chat.id,
                               text=tr.PROCESSING,
                               reply_to_message_id=m.message_id)

    # Download the replied-to media; on failure, show the error and stop.
    download = Downloader(m)
    status, file = await download.start(progress, snt)
    if not status:
        await snt.edit_text(text=file, parse_mode='markdown')
        return

    # Any text after the command becomes the upload title.
    title = ' '.join(m.command[1:])
    upload = Uploader(file, title)
    status, link = await upload.start(progress, snt)
    await snt.edit_text(text=link, parse_mode='markdown')
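# The handler above follows Pyrogram-style conventions (m.command, reply_text,
# edit_text), so it is presumably registered through that framework's decorator
# API. A minimal registration sketch, assuming Pyrogram and a hypothetical
# "/upload" command name; Config, tr, Downloader, Uploader and valid_media are
# provided by the host project and are not shown here.
from pyrogram import Client, filters

app = Client("uploader-bot")  # session name is illustrative only

@app.on_message(filters.command("upload") & filters.private)
async def upload_command(client, message):
    # Delegate to the _upload coroutine defined above.
    await _upload(client, message)

if __name__ == "__main__":
    app.run()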
class Alojz(Reaction):
    help = "Weather forecast from https://alojz.cz - only in czech!\nRun: $ alojz [city=praha]"
    aliasses = ["alojz"]
    attributes = ["city"]
    downloader = None
    parser = None

    def init(self):
        self.downloader = Downloader()
        self.parser = Parser()
        # Default values
        self.ini.set("city", "praha")

    def do(self, city=None):
        if city is None or city == "":
            city = self.ini.get("city")
        return self.weather(city.lower())

    def weather(self, city):
        page = self.downloader.download("https://alojz.cz/" + city)
        return self.parser.parse_class(page, "h2", "actual-forecast")
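# Parser is not defined in this snippet; a minimal sketch of what
# parse_class(page, "h2", "actual-forecast") could do, assuming the downloaded
# page is raw HTML and BeautifulSoup is available (both are assumptions, not
# taken from the code above):
from bs4 import BeautifulSoup

class Parser:
    def parse_class(self, page, tag, cls):
        # Return the text of the first <tag class="cls"> element, or None.
        soup = BeautifulSoup(page, "html.parser")
        element = soup.find(tag, class_=cls)
        return element.get_text(strip=True) if element else None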
class Workflow:
    def __init__(self, db_connection, logger, date, max_downloads=None,
                 max_upload_workers=20, allow_repeat=False):
        self.max_downloads = max_downloads
        self.max_upload_workers = max_upload_workers
        self.total_downloads = 0
        self.upload_queue = Queue()
        self.lock = Lock()
        self.allow_repeat = allow_repeat

        # Set up the database connection and serializers.
        self.db_connection = db_connection
        self.job_serializer = Serializer(self.db_connection, job)
        self.granule_serializer = Serializer(self.db_connection, granule)

        self.logger = logger
        self.date = date
        self.logger.info('Creating a workflow')

        # Downloader that handles asynchronous downloads.
        self.logger.info('Creating aria2 downloader client')
        self.downloader = Downloader(
            on_download_error=self._on_download_error,
            on_download_complete=self._on_download_complete,
            callback_args=(self.lock, ))

        # Copernicus Search API
        self.logger.info('Creating copernicus API connector')
        end_date = self.date + timedelta(days=1)
        start_date = self.date  # self._get_start_date(self.date, end_date)
        self.copernicus = Copernicus(
            start_date=start_date.isoformat(),
            end_date=end_date.isoformat(),
            rows_per_query=30,
        )

    def start(self, parent_state):
        # Start a new job in the database.
        self.job_id = self.job_serializer.post({
            'job_name': 'Sentinel-2 Downloader',
            'start_time': datetime.now(),
            'date_handled': self.date,
            'status': JobStatus.STARTED,
        })
        self.logger.set_job_id(self.job_id)
        self.logger.info('Starting workflow')

        # Upload workers
        self.logger.info('Creating S3 upload processes')
        self.upload_processes = [
            Process(target=upload_worker,
                    args=(self.upload_queue, self.job_id, i))
            for i in range(self.max_upload_workers)
        ]
        parent_state.job_id = self.job_id
        self.at_least_one_failed_download = False

        # Start processes responsible for uploading downloaded files to S3
        # in a concurrent fashion.
        self.logger.info('S3 upload processes started')
        [upload_process.start() for upload_process in self.upload_processes]

        # Read each product from the Copernicus feed and start its download.
        count = 0
        self.logger.info('Fetching granules info from Copernicus',
                         f'Reading for date: {self.date.isoformat()}')
        for product in self.copernicus.read_feed():
            # Check if granule is already in the database.
            existing = self.granule_serializer.get(product.id)
            if existing is None:
                # If not, add it to the database.
                self.granule_serializer.post({
                    'uuid': product.id,
                    'title': product.title,
                    'copernicus_ingestion_date': product.ingestion_date,
                    'validated': False,
                    'downloader_job_id': self.job_id,
                    'download_status': DownloadStatus.NOT_STARTED,
                })
            elif not self.allow_repeat and \
                    existing['download_status'] not in ['ERROR', 'INVALID']:
                # If it was not a failed download, just skip.
                continue

            # And start the download.
            url = product.get_download_link()
            self.logger.info(f'Starting granule download {product.id}',
                             f'URL: {url}')
            self.downloader.start_download(product)
            self.granule_serializer.put(
                product.id, {'download_status': DownloadStatus.DOWNLOADING})

            count += 1
            if self.max_downloads is not None and count >= self.max_downloads:
                break
            # Throttle so that no more than 30 downloads are in flight.
            while count - self.total_downloads >= 30:
                sleep(5)

        # Set max_downloads to the actual number of downloads triggered.
        # This is needed so that we can send a DONE message to the S3 uploader
        # when all downloads complete.
        self.lock.acquire()
        self.max_downloads = count
        # Of course, if all downloads have already been completed at this
        # point, we need to handle that case as well.
        if self.total_downloads == self.max_downloads:
            self.upload_queue.put('DONE')
        self.lock.release()

        # Join the separately running upload processes.
        [upload_process.join() for upload_process in self.upload_processes]
        self.logger.info('All granules downloaded',
                         f'Total granules processed: {self.total_downloads}')
        parent_state.completed = True

        # At this point, we assume that all downloads and uploads have been
        # completed for this job.
        # Stop listening for any download notifications.
        self.downloader.stop_listening()

        # Set the job status to success.
        self.job_serializer.put(self.job_id, {
            'end_time': datetime.now(),
            'status': JobStatus.SUCCESS
        })
        self.logger.info('Stopping workflow')
        self.logger.set_job_id(None)

    def _on_download_error(self, downloader, gid, lock):
        product = downloader.get_download_product(gid)
        error_message, error_code = downloader.get_download_error(gid)
        self.logger.error(f'Download error {product.id}',
                          f'Error Code: {error_code}\n{error_message}')
        self.at_least_one_failed_download = True

        # Download status = ERROR
        self.granule_serializer.put(product.id,
                                    {'download_status': DownloadStatus.ERROR})

        # If these are all the downloads, trigger the upload processes to quit.
        lock.acquire()
        self.total_downloads += 1
        if self.total_downloads == self.max_downloads:
            self.upload_queue.put('DONE')
        lock.release()

    def _on_download_complete(self, downloader, gid, lock):
        product = downloader.get_download_product(gid)
        self.logger.info(f'Download complete {product.id}')

        # After each file downloads, we want to upload it to the S3 bucket.
        filename = downloader.get_download_filename(gid)
        self.upload_queue.put((product.id, filename))

        # If these are all the downloads, trigger the upload processes to quit.
        lock.acquire()
        self.total_downloads += 1
        if self.total_downloads == self.max_downloads:
            self.upload_queue.put('DONE')
        lock.release()
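# upload_worker is referenced by Workflow.start() but not defined in this
# snippet. A sketch of what such a worker could look like, under stated
# assumptions: boto3 for the S3 upload, a bucket name taken from a hypothetical
# environment variable, and re-enqueueing the single 'DONE' sentinel so that
# every sibling worker process eventually sees it and exits.
import os

import boto3

def upload_worker(upload_queue, job_id, worker_index):
    s3 = boto3.client('s3')
    bucket = os.environ.get('UPLOAD_BUCKET', 'granule-uploads')  # assumed name
    while True:
        item = upload_queue.get()
        if item == 'DONE':
            # Pass the sentinel along so the remaining workers also shut down.
            upload_queue.put('DONE')
            break
        granule_id, filename = item
        key = f'{job_id}/{os.path.basename(filename)}'
        s3.upload_file(filename, bucket, key)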