示例#1
0
 def jobVerification(download_dir):
     """Decide whether the current job may proceed, asking the user if needed.

     Looks for ``download_dir + '/' + job_name`` on disk (``job_name`` is a
     module-level name read here). If nothing is there, the job is treated
     as new. If the path exists, the user is asked on the console whether
     the download should be resumed.

     Side effect: sets the module-level ``resume_job`` flag -- True only
     when the path exists and the user answered 'y'.

     Returns:
         True when the path does not exist or the user chose to resume;
         False when the path exists and the user declined.
     """
     global resume_job

     job_path = download_dir + '/' + job_name
     if not os.path.exists(job_path):
         # Nothing on disk yet, so there is nothing to resume.
         resume_job = False
         return True

     answer = consoleutils.readKeyboard(
         "The job already exists, do you want to resume the download? (y/n):",
         '^y$|^n$', 'Select y for Yes or n for No', None)

     # The answer drives both the global flag and the return value.
     resume_job = (answer == 'y')
     return resume_job
示例#2
0
def main(argv):
    """Parse the command line, then start or resume a download job.

    Args:
        argv: list of command-line arguments handed to ``getopt.getopt``.
            Presumably ``sys.argv[1:]`` (without the program name) --
            confirm at the call site.

    Recognised long options: ``--name=<job>`` (required), ``--resume``,
    ``--downloadall``, ``--depth=<n>`` and ``--starturl=<url>``.

    Side effects: creates ``<cwd>/downloads`` when it does not exist, prints
    progress to stdout, and calls ``sys.exit``:
      * status 2 after printing the usage text when the options cannot be
        parsed, ``--depth`` is not a number, or no job name was given;
      * status 1 when ``Job.exists(job_name)`` is true and ``--resume`` was
        not given.
    """
    # getopt long options need the leading '--'.
    usage = "Usage is python anaconda.py --name=some_job_name"

    #This is where it all begins...
    print('Anaconda v1 [socialray.org]')
    print('')

    #if the scraper directory does not exist, create one.
    download_dir = os.getcwd() + '/downloads'
    if not os.path.exists(download_dir):
        print(download_dir + ' directory was not found. Creating....')
        os.mkdir(download_dir)

    #standard text for invalid entry
    invalid = 'Invalid entry, re-enter.'

    #read options from the command-line (from the argv parameter)
    try:
        options, _ = getopt.getopt(
            argv, '', ['name=', 'resume', 'downloadall', 'depth=', 'starturl='])
    except getopt.GetoptError:
        # sys.exc_info() keeps this working on interpreters without
        # the "except ... as" syntax.
        print(str(sys.exc_info()[1]))
        print(usage)
        sys.exit(2)

    job_name = None
    resume_if_exists = False
    # NOTE(review): download_all is parsed but nothing below consumes it yet,
    # and '--starturl' is accepted by getopt but never read.
    download_all = False
    depth = 0
    for opt, val in options:
        if opt in ("-n", "--name"):
            job_name = val
        elif opt in ("-r", "--resume"):
            resume_if_exists = True
        elif opt in ("-a", "--downloadall"):
            download_all = True
        elif opt in ("-d", "--depth"):
            # Keep depth an int, matching its default of 0.
            try:
                depth = int(val)
            except ValueError:
                print("--depth expects a whole number, got " + repr(val))
                print(usage)
                sys.exit(2)

    #without a job name there is nothing to run: print the help text and stop.
    if not job_name:
        print(usage)
        sys.exit(2)

    if Job.exists(job_name) and not resume_if_exists:
        #job exists, we cannot proceed
        print("The specified job already exists. To resume a job, attach the '--resume' option")
        sys.exit(1) #EXIT!
    job = Job(job_name, download_dir, depth)
    job.start()

    # NOTE(review): resume_job, job_settings_filename and the helper functions
    # called below are not defined in this function; they are expected to be
    # provided at module level -- confirm they are set before main() runs.
    if not resume_job:
        dload_target = consoleutils.readKeyboard('What do I download? Categories=c, Categories and Articles=a (default is c):', '^c$|^a$',
            invalid, 'c')
        #wiki will ban us if delay < 1 second
        fetch_delay = int(consoleutils.readKeyboard("Input Fetch Delay Seconds n (default is 2):", r'^\d+$',
            'Enter a number, or press enter to accept the default', '2'))
        scan_depth = int(consoleutils.readKeyboard("Max depth while crawling? (4):", r'^\d+$',
            'Enter a number, or press enter to accept the default', '4'))
        #the download starts from this page
        start_url = consoleutils.readKeyboard('Url to begin downloading from:', None, invalid, None)

        #save to the job settings file; always close it, even if the write fails
        job_settings_file = open(job_settings_filename, 'w')
        try:
            job_settings_file.writelines([dload_target + '\n', str(fetch_delay) + '\n', str(scan_depth) + '\n'])
        finally:
            job_settings_file.close()
    else:
        #read from the previously saved settings file
        job_settings_file = open(job_settings_filename, 'r')
        try:
            dload_target, fetch_delay, scan_depth = withoutNewlines(job_settings_file)
        finally:
            job_settings_file.close()
        fetch_delay = int(fetch_delay)
        scan_depth = int(scan_depth)

    try:
        # NOTE(review): these queues and counters are locals of main();
        # confirm whether the download helpers expect them as module globals.
        cat_dload_queue = Queue.Queue()
        entry_dload_queue = Queue.Queue()
        cat_download_count = entry_download_count = 0

        if not resume_job:
            openJobFiles('w')
            addToCategoryQueue(start_url)
            print("Started download at " + str(datetime.datetime.utcnow()))
        else:
            ok = rebuildState()
            openJobFiles('a') #open for append
            if ok:
                print("Resumed download at " + str(datetime.datetime.utcnow()))
            else:
                sys.exit()

        if dload_target == 'c':
            downloadCategories()
        if dload_target == 'a':
            downloadCategories()
            downloadEntries()

        print("Download completed at " + str(datetime.datetime.utcnow()))
    except:
        # Deliberately catches everything (including KeyboardInterrupt and the
        # SystemExit raised above) so the job files are closed on any abort.
        print(sys.exc_info())
        closeJobFiles()
        print("Download aborted at " + str(datetime.datetime.utcnow()))