def run(polling_interval=None, source_url=None, batch_size=None, format_json=None, propagate_error=None, check_path=None ,**_):
    """
    Poll 'source_url' in an endless loop and hand each successful fetch
    to the processor.

    On every iteration:
    1. leave the loop if the parent pid changed (parent terminated)
    2. if 'check_path' was given, only fetch while that path exists
    3. fetch 'source_url': on success send (code, headers, data) to the
       processor; on failure, optionally emit a JSON error record
    4. sleep for 'polling_interval' seconds

    Parameters:
    polling_interval -- value passed to sleep() between iterations
    source_url       -- URL handed to fetch() on every iteration
    batch_size       -- forwarded untouched to process()
    format_json      -- forwarded untouched to process()
    propagate_error  -- forwarded to process(); when true, a fetch failure
                        is also reported through stdout() as a JSON record
    check_path       -- optional filesystem path gating the fetch
    **_              -- extra keyword arguments are accepted and ignored

    Returns None, and only once the parent process has changed.
    """
    import json  # local import: only needed to escape the error record values

    # NOTE(review): process() is only used through .send(), so it presumably
    # returns an already-primed generator -- confirm against its definition.
    proc = process(source_url, propagate_error, format_json, batch_size)

    if check_path is not None:
        # NOTE(review): ct.send() yields a pair whose first item equals "tr"
        # on what looks like a state transition -- confirm.
        ct = check_transition()

    ppid = os.getppid()
    logging.info("Process pid: %s", os.getpid())
    logging.info("Parent pid: %s", ppid)
    logging.info("Starting loop...")

    while True:
        # A different parent pid means the original parent went away.
        if os.getppid() != ppid:
            logging.warning("Parent terminated... exiting")
            break

        if check_path is not None:
            try:
                exists = os.path.exists(check_path)
            except Exception:
                # Deliberate best effort: an unusable path counts as missing.
                exists = False

            # Separate throwaway name so the '**_' catch-all is not rebound.
            maybe_tr, _ignored = ct.send(exists)
            if maybe_tr == "tr":
                if exists:
                    logging.info("Check path: passed")
                else:
                    logging.info("Check path: failed - skipping")
        else:
            ## fake 'exists'
            exists = True

        if exists:
            status, (code, headers, data) = fetch(source_url)
            if status.startswith("ok"):
                proc.send((code, headers, data))
            elif propagate_error:
                # json.dumps() quotes and escapes each value so the record
                # stays valid JSON even if the URL or the code contains
                # quotes or backslashes. Layout is otherwise unchanged.
                stdout('{"status":"error", "kind":"fetch", "source_url":%s, "http_code":%s}'
                       % (json.dumps("%s" % (source_url,)),
                          json.dumps("%s" % (code,))))

        logging.debug("...sleeping for %s seconds", polling_interval)
        sleep(polling_interval)
def process(src_file, dest_path, delete_fetch_error):
    """
    Download the file referenced by the 'pointer' file 'src_file'

    1. read file, extract URL
    2. fetch file from URL
    3. write fetched file to dest_path
    4. delete pointer file
    5. write a JSON record describing the download to stdout

    Parameters:
    src_file           -- path of the pointer file holding the URL
    dest_path          -- directory the fetched file is written to
    delete_fetch_error -- when true, the pointer file is deleted if the
                          fetch itself fails

    Returns None. Returns early (after logging an error) when the pointer
    file cannot be read or the HTTP response code is not 200.

    Raises:
    Exception  -- invalid pointer file contents, failed fetch, or failed
                  write of the destination file
    BrokenPipe -- the JSON record could not be written to stdout
    """
    code, contents = file_contents(src_file)
    if not code.startswith("ok"):
        logging.error("Can't read file contents from '%s'", src_file)
        return

    try:
        url = contents.strip()
    except Exception:
        raise Exception("Invalid data in file: %s" % src_file)

    code, (http_code, headers, data) = fetch(url)
    if not code.startswith("ok"):
        if delete_fetch_error:
            code, _msg = rm(src_file)
            logging.warning("Attempting to delete source file '%s': %s", src_file, code)
        raise Exception("Can't fetch page from url: %s" % url)

    # Normalize the response code when possible; a non-numeric value is
    # kept as-is and then rejected by the comparison below.
    try:
        http_code = int(http_code)
    except (TypeError, ValueError, OverflowError):
        pass

    if http_code != 200:
        logging.error("Can't fetch url '%s', http response code: %s", url, http_code)
        return

    # Name the destination after the URL's filename when it can be
    # extracted, otherwise fall back to a generated unique name.
    code, maybe_components = extract_url_filename(url)
    if not code.startswith("ok"):
        fbn = str(uuid.uuid1())
        dest_filename = os.path.join(dest_path, fbn)
    else:
        fbn, fext = maybe_components
        dest_filename = os.path.join(dest_path, fbn) + fext

    try:
        exists = os.path.exists(dest_filename)
    except Exception:
        # Deliberate best effort: treat an unusable path as "not there".
        exists = False

    if exists:
        # Never overwrite an existing file.
        # NOTE(review): the replacement name drops the extension -- confirm
        # this is intended.
        fbn = str(uuid.uuid1())
        dest_filename = os.path.join(dest_path, fbn)

    code, msg = atomic_write(dest_filename, data)
    if not code.startswith("ok"):
        raise Exception("Can't write to file '%s': %s" % (dest_filename, msg))

    ctx = {
        "dest_filename": dest_filename,
        "src_filename": src_file,
        "url": url,
        "http_code": http_code,
        "headers": headers,
    }

    # The pointer file has served its purpose; failing to delete it is
    # logged only.
    code, msg = rm(src_file)
    if not code.startswith("ok"):
        logging.error("Can't delete '%s' : will probably cause excessive downloads...", src_file)

    try:
        sys.stdout.write(json.dumps(ctx) + "\n")
        # Flush inside the guard so a closed pipe is detected here rather
        # than at some later write or at interpreter exit.
        sys.stdout.flush()
    except Exception:
        raise BrokenPipe()
def run(dest_path=None ,**_):
    """
    Read download requests from stdin, fetch each one, and report every
    completed download as a JSON line on stdout.

    Each input line is either
        url
    or
        url destination_file_path
    When only the URL is given, the file is written under 'dest_path'
    using the basename of the URL.

    Parameters:
    dest_path -- optional default destination directory; when given it
                 must be an existing directory
    **_       -- extra keyword arguments are accepted and ignored

    Returns None once the parent process has changed or stdin reaches
    end-of-file.

    Raises:
    Exception -- 'dest_path' is not a directory, the downloaded data
                 could not be written, or the JSON record could not be
                 written to stdout
    """
    if dest_path is not None and not os.path.isdir(dest_path):
        raise Exception("Expecting a valid destination path '%s'" % dest_path)

    ppid = os.getppid()
    logging.info("Process pid: %s", os.getpid())
    logging.info("Parent pid : %s", ppid)
    logging.info("Starting loop...")

    while True:
        # A different parent pid means the original parent went away.
        if os.getppid() != ppid:
            logging.warning("Parent terminated... exiting")
            break

        raw_line = sys.stdin.readline()
        if not raw_line:
            # readline() returns an empty string only at end-of-file.
            # Without this exit the loop would spin forever on a closed
            # stdin.
            logging.info("End of input on stdin... exiting")
            break
        iline = raw_line.strip()

        #################### VALIDATE
        ## accepted forms: "url" or "url dst_path"
        ## split() without arguments makes a blank line yield no fields, so
        ## the emptiness check really fires, and tolerates repeated
        ## separators.
        bits = iline.split()
        if not bits or len(bits) > 2:
            logging.error("Invalid input line: %s", iline)
            continue

        url = bits[0]
        if len(bits) == 2:
            path = bits[1]
        elif dest_path is not None:
            path = os.path.join(dest_path, os.path.basename(url))
        else:
            logging.warning("Didn't receive 'dest_path' from stdin and none specified on command line...")
            continue

        ####### WRITE CAPABILITY VERIFICATION
        code, result = can_write(path)
        if not code.startswith("ok") or not result:
            logging.warning("Won't be able to write to path '%s'... skipping download", path)
            continue

        ####### DOWNLOAD
        code, (http_code, headers, data) = fetch(url)
        if not code.startswith("ok"):
            logging.warning("Error attempting to download: %s", url)
            continue

        # Normalize the response code when possible; a non-numeric value
        # is kept as-is and then rejected by the comparison below.
        try:
            http_code = int(http_code)
        except (TypeError, ValueError, OverflowError):
            pass

        if http_code != 200:
            logging.warning("Can't fetch url '%s', http response code: %s", url, http_code)
            continue

        code, msg = atomic_write(path, data)
        if not code.startswith("ok"):
            raise Exception("Can't write to file '%s': %s" % (path, msg))

        ctx = {
            "dest_filename": path,
            "url": url,
            "http_code": http_code,
            "headers": headers,
        }

        try:
            sys.stdout.write(json.dumps(ctx) + "\n")
            # Flush inside the guard so a closed pipe is detected here and
            # the reader sees each record without buffering delay.
            sys.stdout.flush()
        except Exception:
            raise Exception("Exiting... probably broken pipe")