예제 #1
0
def run(polling_interval=None, source_url=None,
        batch_size=None,
        format_json=None, propagate_error=None, check_path=None,
        **_):
    """
    Poll ``source_url`` in an endless loop and hand every successful fetch
    to the processor returned by ``process()``.

    The loop only ends when the parent process id changes, which is
    treated as "parent terminated".

    :param polling_interval: value passed to ``sleep()`` after each iteration
    :param source_url: URL handed to ``fetch()`` on every iteration
    :param batch_size: forwarded untouched to ``process()``
    :param format_json: forwarded untouched to ``process()``
    :param propagate_error: forwarded to ``process()``; when truthy a failed
        fetch is additionally reported through ``stdout()``
    :param check_path: optional filesystem path; when given, fetching only
        happens on iterations where this path exists
    :param _: any other keyword arguments are accepted and ignored
    """
    proc = process(source_url, propagate_error, format_json, batch_size)

    # The transition tracker is only needed when a check path is configured.
    # NOTE(review): it is driven with .send() on its very first use, so it is
    # presumably an already-primed generator -- confirm in check_transition().
    if check_path is not None:
        ct = check_transition()

    ppid = os.getppid()
    logging.info("Process pid: %s", os.getpid())
    logging.info("Parent pid: %s", ppid)
    logging.info("Starting loop...")
    while True:
        # A different parent pid means the original parent is gone.
        if os.getppid() != ppid:
            logging.warning("Parent terminated... exiting")
            break

        if check_path is None:
            # No gate configured: behave as if the path always exists.
            exists = True
        else:
            # Best effort: any ordinary failure counts as "missing", but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            try:
                exists = os.path.exists(check_path)
            except Exception:
                exists = False

            # Only log when the tracker reports a transition ("tr").
            maybe_tr, _ignored = ct.send(exists)
            if maybe_tr == "tr":
                if exists:
                    logging.info("Check path: passed")
                else:
                    logging.info("Check path: failed - skipping")

        if exists:
            status, (code, headers, data) = fetch(source_url)
            if status.startswith("ok"):
                proc.send((code, headers, data))
            elif propagate_error:
                # NOTE(review): the record is built by plain string
                # interpolation, so a quote in source_url yields invalid JSON.
                # Left byte-identical because consumers may depend on it.
                stdout('''{"status":"error", "kind":"fetch", "source_url":"%s", "http_code":"%s"}''' % (source_url, code))

        logging.debug("...sleeping for %s seconds", polling_interval)
        sleep(polling_interval)
예제 #2
0
def process(src_file, dest_path, delete_fetch_error):
    """
    Download the file referenced by a pointer file.

    1. read file, extract URL
    2. fetch file from URL
    3. write fetched file to dest_path
    4. delete pointer file
    5. write a JSON description of the download as one line on stdout

    Returns ``None`` in every case. The function returns early, after
    logging an error, when the pointer file cannot be read or when the HTTP
    response code is not 200.

    :param src_file: path of the pointer file whose contents are the URL
    :param dest_path: directory the fetched data is written into
    :param delete_fetch_error: when truthy, the pointer file is removed if
        the fetch itself fails
    :raises Exception: the pointer file holds invalid data, the fetch
        failed, or the destination file could not be written
    :raises BrokenPipe: the JSON result could not be written to stdout
    """
    code, contents = file_contents(src_file)
    if not code.startswith("ok"):
        logging.error("Can't read file contents from '%s'", src_file)
        return

    # Anything that does not behave like a string is rejected here.
    try:
        url = contents.strip()
    except Exception:
        raise Exception("Invalid data in file: %s" % src_file)

    code, (http_code, headers, data) = fetch(url)
    if not code.startswith("ok"):
        if delete_fetch_error:
            rm_code, _msg = rm(src_file)
            logging.warning("Attempting to delete source file '%s': %s", src_file, rm_code)
        raise Exception("Can't fetch page from url: %s" % url)

    # Normalise the response code; a non-numeric value is kept as-is and
    # simply fails the comparison with 200 below.
    try:
        http_code = int(http_code)
    except (TypeError, ValueError):
        pass

    if http_code != 200:
        logging.error("Can't fetch url '%s', http response code: %s", url, http_code)
        return

    # Prefer the file name found in the URL; fall back to a generated name.
    code, maybe_components = extract_url_filename(url)
    if code.startswith("ok"):
        fbn, fext = maybe_components
        dest_filename = os.path.join(dest_path, fbn) + fext
    else:
        fbn = str(uuid.uuid1())
        dest_filename = os.path.join(dest_path, fbn)

    try:
        exists = os.path.exists(dest_filename)
    except Exception:
        exists = False

    # Never overwrite an existing file: switch to a generated name instead.
    # NOTE(review): the generated name carries no extension -- confirm that
    # dropping the extension on a collision is intended.
    if exists:
        fbn = str(uuid.uuid1())
        dest_filename = os.path.join(dest_path, fbn)

    code, msg = atomic_write(dest_filename, data)
    if not code.startswith("ok"):
        raise Exception("Can't write to file '%s': %s" % (dest_filename, msg))

    ctx = {
        "dest_filename": dest_filename,
        "src_filename": src_file,
        "url": url,
        "http_code": http_code,
        "headers": headers,
    }

    # The data is stored, so the pointer file is removed; a failure here is
    # only logged.
    code, msg = rm(src_file)
    if not code.startswith("ok"):
        logging.error("Can't delete '%s' : will probably cause excessive downloads...", src_file)

    # Any ordinary failure while emitting the result is reported as a broken
    # pipe; KeyboardInterrupt / SystemExit now propagate unchanged.
    try:
        sys.stdout.write(json.dumps(ctx) + "\n")
    except Exception:
        raise BrokenPipe()
예제 #3
0
def run(dest_path=None, **_):
    """
    Read download requests from stdin, one per line, and fetch each of them.

    An input line is either ``url`` or ``url path`` (separated by a single
    space). With only a URL, the destination is the URL's basename joined
    onto ``dest_path``. Every completed download is reported as one JSON
    line on stdout.

    The loop ends when the parent process id changes or when stdin reaches
    end-of-file.

    :param dest_path: optional default destination directory; must be an
        existing directory when given
    :param _: any other keyword arguments are accepted and ignored
    :raises Exception: ``dest_path`` is not a directory, the downloaded data
        could not be written, or the result could not be written to stdout
    """
    if dest_path is not None:
        if not os.path.isdir(dest_path):
            raise Exception("Expecting a valid destination path '%s'" % dest_path)

    ppid = os.getppid()
    logging.info("Process pid: %s", os.getpid())
    logging.info("Parent pid : %s", ppid)
    logging.info("Starting loop...")
    while True:
        # A different parent pid means the original parent is gone.
        if os.getppid() != ppid:
            logging.warning("Parent terminated... exiting")
            break

        # readline() returns an empty string only at end-of-file (a blank
        # line still carries its newline). Without this check a closed stdin
        # would make the loop spin forever.
        raw_line = sys.stdin.readline()
        if not raw_line:
            logging.warning("Input stream closed... exiting")
            break
        iline = raw_line.strip()

        #################### VALIDATE
        ## accepted forms:  url   |   url dst_path
        # str.split(" ") never returns an empty list, so a blank line has to
        # be rejected explicitly.
        bits = iline.split(" ")
        if not iline or len(bits) > 2:
            logging.error("Invalid input line: %s", iline)
            continue

        url = bits[0]
        bn = os.path.basename(url)

        if len(bits) == 2:
            path = bits[1]
        elif dest_path is not None:
            path = os.path.join(dest_path, bn)
        else:
            logging.warning("Didn't receive 'dest_path' from stdin and none specified on command line...")
            continue

        ####### WRITE CAPABILITY VERIFICATION
        code, result = can_write(path)
        if not code.startswith("ok") or not result:
            logging.warning("Won't be able to write to path '%s'... skipping download", path)
            continue

        ####### DOWNLOAD
        code, (http_code, headers, data) = fetch(url)
        if not code.startswith("ok"):
            logging.warning("Error attempting to download: %s", url)
            continue

        # Normalise the response code; a non-numeric value is kept as-is and
        # simply fails the comparison with 200 below.
        try:
            http_code = int(http_code)
        except (TypeError, ValueError):
            pass

        if http_code != 200:
            logging.warning("Can't fetch url '%s', http response code: %s", url, http_code)
            continue

        code, msg = atomic_write(path, data)
        if not code.startswith("ok"):
            raise Exception("Can't write to file '%s': %s" % (path, msg))

        ctx = {
            "dest_filename": path,
            "url": url,
            "http_code": http_code,
            "headers": headers,
        }

        # KeyboardInterrupt / SystemExit propagate unchanged; every ordinary
        # failure is still reported with the original message.
        try:
            sys.stdout.write(json.dumps(ctx) + "\n")
        except Exception:
            raise Exception("Exiting... probably broken pipe")