def process(src_file, dest_path, delete_fetch_error):
    """
    Download the URL named by a pointer file and store it under `dest_path`.

    1. read file, extract URL
    2. fetch file from URL
    3. write fetched file to dest_path
    4. delete pointer file
    5. write a one-line JSON summary to stdout

    The helpers used here (`file_contents`, `fetch`, `extract_url_filename`,
    `atomic_write`, `rm`) all return a `(code, payload)` pair; a `code`
    starting with "ok" is treated as success.

    :param src_file: path of the pointer file whose contents are the URL
    :param dest_path: directory receiving the downloaded file
    :param delete_fetch_error: when true, attempt to delete `src_file`
                               if the fetch itself fails
    :return: None. An unreadable pointer file or a non-200 response is
             logged and the function returns early.
    :raises Exception: pointer contents unusable, fetch failed, or the
                       downloaded data could not be written
    :raises BrokenPipe: the JSON summary could not be written to stdout
    """
    code, contents = file_contents(src_file)
    if not code.startswith("ok"):
        logging.error("Can't read file contents from '%s'", src_file)
        return

    try:
        url = contents.strip()
    except Exception:
        raise Exception("Invalid data in file: %s" % src_file)

    code, (http_code, headers, data) = fetch(url)
    if not code.startswith("ok"):
        if delete_fetch_error:
            code, _msg = rm(src_file)
            logging.warning("Attempting to delete source file '%s': %s", src_file, code)
        raise Exception("Can't fetch page from url: %s" % url)

    # The status may not be an int yet; if it can't be converted the
    # comparison below fails and the response is rejected.
    try:
        http_code = int(http_code)
    except (TypeError, ValueError):
        pass

    if http_code != 200:
        logging.error("Can't fetch url '%s', http response code: %s", url, http_code)
        return

    # Prefer the file name embedded in the URL; otherwise invent a unique one.
    code, maybe_components = extract_url_filename(url)
    if code.startswith("ok"):
        fbn, fext = maybe_components
        dest_filename = os.path.join(dest_path, fbn) + fext
    else:
        dest_filename = os.path.join(dest_path, str(uuid.uuid1()))

    # Best-effort collision check: never overwrite an existing file.
    try:
        exists = os.path.exists(dest_filename)
    except Exception:
        exists = False

    if exists:
        # NOTE: the fallback name is a bare UUID and carries no extension.
        dest_filename = os.path.join(dest_path, str(uuid.uuid1()))

    code, msg = atomic_write(dest_filename, data)
    if not code.startswith("ok"):
        raise Exception("Can't write to file '%s': %s" % (dest_filename, msg))

    ctx = {
        "dest_filename": dest_filename,
        "src_filename": src_file,
        "url": url,
        "http_code": http_code,
        "headers": headers,
    }

    # The pointer file has done its job; if it can't be removed the same
    # URL is likely to be downloaded again, so only log the failure.
    code, _msg = rm(src_file)
    if not code.startswith("ok"):
        logging.error("Can't delete '%s' : will probably cause excessive downloads...", src_file)

    try:
        sys.stdout.write(json.dumps(ctx) + "\n")
        # Flush so a piped consumer sees the line now and a closed pipe is
        # detected here rather than at some later, unrelated write.
        sys.stdout.flush()
    except Exception:
        raise BrokenPipe()
def run(dest_path=None, **_):
    """
    Read download requests from stdin and fetch each one until stopped.

    Each input line is either `url` or `url dst_path`, separated by
    whitespace. With only a URL, the file is written to
    `dest_path/<basename of url>`; if `dest_path` was not given the line is
    skipped. After every successful download a one-line JSON summary is
    written to stdout.

    The loop ends when stdin reaches end-of-file or when the parent process
    id changes (parent terminated).

    :param dest_path: optional default destination directory; must exist
    :param _: extra keyword arguments are accepted and ignored
    :return: None
    :raises Exception: `dest_path` is not a directory, downloaded data could
                       not be written, or the summary could not be written
                       to stdout
    """
    if dest_path is not None and not os.path.isdir(dest_path):
        raise Exception("Expecting a valid destination path '%s'" % dest_path)

    ppid = os.getppid()
    logging.info("Process pid: %s", os.getpid())
    logging.info("Parent pid : %s", ppid)
    logging.info("Starting loop...")

    while True:
        if os.getppid() != ppid:
            logging.warning("Parent terminated... exiting")
            break

        raw_line = sys.stdin.readline()
        if not raw_line:
            # readline() returns "" only at end-of-file; without this check
            # the loop would spin forever on an exhausted stdin.
            logging.info("End of input on stdin... exiting")
            break

        iline = raw_line.strip()

        #################### VALIDATE
        ## expected: "url" or "url dst_path" on a single line
        bits = iline.split()
        if len(bits) not in (1, 2):
            logging.error("Invalid input line: %s", iline)
            continue

        url = bits[0]
        if len(bits) == 2:
            path = bits[1]
        elif dest_path is not None:
            path = os.path.join(dest_path, os.path.basename(url))
        else:
            logging.warning("Didn't receive 'dest_path' from stdin and none specified on command line...")
            continue

        ####### WRITE CAPABILITY VERIFICATION
        code, result = can_write(path)
        if not code.startswith("ok") or not result:
            logging.warning("Won't be able to write to path '%s'... skipping download", path)
            continue

        ####### DOWNLOAD
        code, (http_code, headers, data) = fetch(url)
        if not code.startswith("ok"):
            logging.warning("Error attempting to download: %s", url)
            continue

        # The status may not be an int yet; if it can't be converted the
        # comparison below fails and the response is rejected.
        try:
            http_code = int(http_code)
        except (TypeError, ValueError):
            pass

        if http_code != 200:
            logging.warning("Can't fetch url '%s', http response code: %s", url, http_code)
            continue

        code, msg = atomic_write(path, data)
        if not code.startswith("ok"):
            raise Exception("Can't write to file '%s': %s" % (path, msg))

        ctx = {
            "dest_filename": path,
            "url": url,
            "http_code": http_code,
            "headers": headers,
        }

        try:
            sys.stdout.write(json.dumps(ctx) + "\n")
            # Flush per record so a piped consumer sees results promptly
            # and a closed pipe is noticed right away.
            sys.stdout.flush()
        except Exception:
            raise Exception("Exiting... probably broken pipe")