def celeryTaskFactoryUnique(job_num, job_package):
    '''
    Generate serialized FOXML for the selected MODS / XSL transform pair,
    then fire one taskWrapper ingest worker per FOXML document.

    job_num is taken from job_package; the est_count for the job is set
    in Redis before any workers are fired.
    '''
    # reconstitute
    form_data = job_package['form_data']
    job_num = job_package['job_num']

    # retrieve serialized FOXML documents to ingest
    serialized_foxml_docs = genFOXML("retrieve", form_data['MODS_id'], form_data['xsl_trans_id'])

    # record the expected task count for this job in Redis
    redisHandles.r_job_handle.set("job_{job_num}_est_count".format(job_num=job_num), len(serialized_foxml_docs))

    # fire one ingest worker per FOXML document
    current_step = 1
    for foxml_doc in serialized_foxml_docs:
        job_package['PID'] = "N/A"
        job_package['step'] = current_step
        job_package['FOXML'] = foxml_doc

        # fire ingester asynchronously
        async_result = actions.actions.taskWrapper.delay(job_package)
        task_id = async_result.id

        # bump the job's total-assigned counter
        jobs.jobUpdateAssignedCount(job_num)

        current_step += 1
def celeryTaskFactoryBagIngest(job_num,job_package): # reconstitute job_num = job_package['job_num'] # update job info redisHandles.r_job_handle.set("job_{job_num}_est_count".format(job_num=job_num),1) # ingest in Fedora step = 1 job_package['PID'] = "N/A" job_package['step'] = step # fire ingester result = actions.actions.taskWrapper.delay(job_package) task_id = result.id print task_id # update incrementer for total assigned jobs.jobUpdateAssignedCount(job_num) # bump step step += 1
def celeryTaskFactoryImportMODS(job_num,job_package): ''' Problem - too big to send the MODS XML to Redis. Need to stick it in MySQL, or text file. Write to temp file. Case closed. ''' # reconstitute form_data = job_package['form_data'] job_num = job_package['job_num'] # get mods:collection if 'upload_data' in job_package: MODS_collection = job_package['upload_data'] job_package['upload_data'] = False #scrub data elif form_data['content'] != '': MODS_collection = form_data['content'] form_data['content'] = False #scrub data # shunt each MODS record to list MODS_collection = unicode(MODS_collection, 'utf-8') XMLroot = etree.fromstring(MODS_collection.encode('utf-8')) MODS_list = XMLroot.findall('{http://www.loc.gov/mods/v3}mods') # update job info redisHandles.r_job_handle.set("job_{job_num}_est_count".format(job_num=job_num),len(MODS_list)) # ingest in Fedora step = 1 for MODS_elem in MODS_list: # read <mods:extension><PID>, pass this as PID PID_search = MODS_elem.findall("{http://www.loc.gov/mods/v3}extension/PID") if len(PID_search) == 0: print "Could not find PID, skipping" continue else: PID = PID_search[0].text # write MODS to temp file temp_filename = "/tmp/Ouroboros/"+str(uuid.uuid4())+".xml" fhand = open(temp_filename,'w') fhand.write(etree.tostring(MODS_elem)) fhand.close() job_package['PID'] = PID job_package['step'] = step job_package['MODS'] = temp_filename # fire ingester result = actions.actions.taskWrapper.delay(job_package) task_id = result.id # update incrementer for total assigned jobs.jobUpdateAssignedCount(job_num) # bump step step += 1
def pruneSolr_factory(job_package): # set new task_name, for the worker below job_package['custom_task_name'] = 'pruneSolr_worker' # get solr results obj solr_total = solr_handle.search(q='*:*', fl='id').total_results # set estimated tasks print "Antipcating",solr_total,"tasks...." redisHandles.r_job_handle.set("job_%s_est_count" % (job_package['job_num']), solr_total) # iterate through solr objects # variables start = 0 rows = 100 step = 1 while start < solr_total: # perform search solr_result = solr_handle.search(q='*:*', fl='id', rows=rows, start=start) # iterate for doc in solr_result.documents: doc_id = doc['id'] print "pruneSolr checking %s" % (doc_id) job_package['doc_id'] = doc_id # fire task via custom_loop_taskWrapper result = actions.actions.custom_loop_taskWrapper.apply_async(kwargs={'job_package':job_package}, queue=job_package['username'] ) task_id = result.id # Set handle in Redis redisHandles.r_job_handle.set("%s" % (task_id), "FIRED,%s" % (doc_id)) # update incrementer for total assigned jobs.jobUpdateAssignedCount(job_package['job_num']) # bump step step += 1 # bump start start += rows
def celeryTaskFactory(**kwargs): # create job_package job_package = kwargs['job_package'] # get username username = job_package['username'] # get job_num job_num = kwargs['job_num'] # get and iterate through user selectedPIDs PIDlist = kwargs['PIDlist'] # task function for taskWrapper job_package['task_name'] = kwargs['task_name'] #set step counter step = 1 # iterate through PIDs for PID in PIDlist: time.sleep(.001) job_package['step'] = step job_package['PID'] = PID # fire off async task via taskWrapper result = taskWrapper.delay(job_package) task_id = result.id # Set handle in redisHandles.r_job_handle.set("{task_id}".format(task_id=task_id), "FIRED,{PID}".format(PID=PID)) # update incrementer for total assigned jobs.jobUpdateAssignedCount(job_num) # bump step step += 1 print "Finished assigning tasks"
def obj_loop_taskFactory(**kwargs): # create job_package job_package = kwargs['job_package'] # username username = job_package['username'] # get job_num job_num = kwargs['job_num'] # get and iterate through user selectedPIDs PIDlist = kwargs['PIDlist'] # task function for obj_loop_taskWrapper job_package['task_name'] = kwargs['task_name'] #set step counter step = 1 # iterate through PIDs for PID in PIDlist: time.sleep(.001) job_package['step'] = step job_package['PID'] = PID # fire off async task via obj_loop_taskWrapper result = obj_loop_taskWrapper.apply_async(kwargs={'job_package':job_package,}, queue=username) task_id = result.id # Set handle in redisHandles.r_job_handle.set("%s" % (task_id), "FIRED,%s" % (PID)) # update incrementer for total assigned jobs.jobUpdateAssignedCount(job_num) # bump step step += 1 print "Finished assigning tasks"
def MODSimport_factory(job_package): print "FIRING MODSimport_factory" # get form data form_data = job_package['form_data'] # set new task_name, for the worker below job_package['custom_task_name'] = 'MODSimport_worker' # get mods:collection if 'upload_data' in job_package: with open(job_package['upload_data'], 'r') as fhand: MODS_collection = fhand.read() elif form_data['content'] != '': MODS_collection = form_data['content'] # shunt each MODS record to list MODS_collection = unicode(MODS_collection, 'utf-8') XMLroot = etree.fromstring(MODS_collection.encode('utf-8')) MODS_list = XMLroot.findall('{http://www.loc.gov/mods/v3}mods') print MODS_list # update job info redisHandles.r_job_handle.set("job_%s_est_count" % (job_package['job_num']), len(MODS_list)) # ingest in Fedora step = 1 for MODS_elem in MODS_list: print "Loading %s / %s" % (step, len(MODS_list)) # read <mods:extension><PID>, pass this as PID PID_search = MODS_elem.findall("{http://www.loc.gov/mods/v3}extension/PID") if len(PID_search) == 0: print "Could not find PID, skipping" # bump step step += 1 continue else: PID = PID_search[0].text print "FOUND THE PID:",PID # write MODS to temp file temp_filename = "/tmp/Ouroboros/"+str(uuid.uuid4())+".xml" fhand = open(temp_filename,'w') fhand.write(etree.tostring(MODS_elem)) fhand.close() job_package['PID'] = PID job_package['step'] = step job_package['MODS'] = temp_filename # fire task via custom_loop_taskWrapper result = actions.actions.custom_loop_taskWrapper.apply_async(kwargs={'job_package':job_package}, queue=job_package['username']) task_id = result.id # Set handle in Redis redisHandles.r_job_handle.set("%s" % (task_id), "FIRED,%s" % (PID)) # update incrementer for total assigned jobs.jobUpdateAssignedCount(job_package['job_num']) # bump step step += 1 print "Finished firing MODS import workers"
def bagIngest_factory(job_package): # get form data form_data = job_package['form_data'] if "ingest_type" in form_data: ingest_type = form_data['ingest_type'] else: return "No ingest type selected, aborting." # set new task_name, for the worker below job_package['custom_task_name'] = 'bagIngest_worker' # Single Ingest Type ################################################################# if ingest_type == "single": payload_location = job_package['form_data']['payload_location'] # create working directory in workspace bag_dir = payloadExtractor(payload_location,ingest_type) job_package['bag_dir'] = bag_dir # set estimated tasks print "Antipcating 1 tasks...." redisHandles.r_job_handle.set("job_%s_est_count" % (job_package['job_num']), 1) step = 1 result = actions.actions.custom_loop_taskWrapper.apply_async(kwargs={'job_package':job_package}, queue=job_package['username']) task_id = result.id # Set handle in Redis redisHandles.r_job_handle.set("%s" % (task_id), "FIRED,%s" % (bag_dir)) # update incrementer for total assigned jobs.jobUpdateAssignedCount(job_package['job_num']) # bump step step += 1 print "Finished firing ingest workers" # Multiple Ingest Type ################################################################# if ingest_type == "multiple": # extract payload_location payload_location = job_package['form_data']['payload_location'] # create working directory in workspace bag_dir = payloadExtractor(payload_location,ingest_type) if bag_dir == False: print "Aborting" return False print "Bag dir at this point:",bag_dir # all items inside bag_dir bag_dirs_tuple = os.walk(bag_dir).next() # dirs if len(bag_dirs_tuple[1]) > 0: print "Directories detected, continuing" # archives if len(bag_dirs_tuple[2]) > 0: print "Archive files detected. Extracting and continuing." 
for archive in bag_dirs_tuple[2]: archive_filename = bag_dirs_tuple[0] + "/" + archive print archive_filename # extract to temp dir tar_handle = tarfile.open(archive_filename) tar_handle.extractall(path=bag_dirs_tuple[0]) os.system("rm %s" % (archive_filename)) # finally, rewalk bag_dirs_tuple = os.walk(bag_dir).next() # dirs bag_dirs = [ bag_dirs_tuple[0] + "/" + bag_name for bag_name in bag_dirs_tuple[1] ] print bag_dirs # set estimated tasks print "Antipcating",len(bag_dirs),"tasks...." redisHandles.r_job_handle.set("job_%s_est_count" % (job_package['job_num']), len(bag_dirs)) # iterate through bags step = 1 for bag_dir in bag_dirs: print "Ingesting %s / %s" % (step, len(bag_dirs)) job_package['bag_dir'] = bag_dir # fire task via custom_loop_taskWrapper result = actions.actions.custom_loop_taskWrapper.apply_async(kwargs={'job_package':job_package}, queue=job_package['username']) task_id = result.id # Set handle in Redis redisHandles.r_job_handle.set("%s" % (task_id), "FIRED,%s" % (bag_dir)) # update incrementer for total assigned jobs.jobUpdateAssignedCount(job_package['job_num']) # bump step step += 1 print "Finished firing ingest workers"