def hotspotsRange_ts(start_time, stop_time, location, **kwargs):
    '''Run over a range of timesteps at 5 minute intervals in between.'''
    start = datetime.strptime(start_time, '%Y%m%d.%H%M%S')
    stop = datetime.strptime(stop_time, '%Y%m%d.%H%M%S')
    kwargs.update({'task_id': hotspotsRange_ts.request.id})
    job = TaskSet(tasks=[
        cybercomq.gis.hotspotpysal.hotspots.subtask(
            args=(ts, location), kwargs=kwargs,
            queue="gis", track_started=True)
        for ts in date_range(start, stop)
    ])
    job.apply_async()
    return '%s' % (hotspotsRange_ts.request.id)
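# The task above relies on a date_range() helper that is not defined in this
# snippet. A minimal sketch of one possible implementation, assuming it only
# needs to yield datetimes at 5 minute intervals between start and stop, as
# the docstring suggests; the real helper may differ.
from datetime import timedelta


def date_range(start, stop, step=timedelta(minutes=5)):
    """Yield datetimes from start up to and including stop at `step` intervals."""
    current = start
    while current <= stop:
        yield current
        current += step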
def test_counter_taskset(self):
    increment_counter.count = 0
    ts = TaskSet(tasks=[
        increment_counter.s(),
        increment_counter.s(increment_by=2),
        increment_counter.s(increment_by=3),
        increment_counter.s(increment_by=4),
        increment_counter.s(increment_by=5),
        increment_counter.s(increment_by=6),
        increment_counter.s(increment_by=7),
        increment_counter.s(increment_by=8),
        increment_counter.s(increment_by=9),
    ])
    self.assertEqual(ts.total, 9)

    consumer = increment_counter.get_consumer()
    consumer.purge()
    consumer.close()
    taskset_res = ts.apply_async()
    subtasks = taskset_res.subtasks
    taskset_id = taskset_res.taskset_id
    consumer = increment_counter.get_consumer()
    for subtask in subtasks:
        m = consumer.queues[0].get().payload
        self.assertDictContainsSubset(
            {'taskset': taskset_id,
             'task': increment_counter.name,
             'id': subtask.id}, m)
        increment_counter(
            increment_by=m.get('kwargs', {}).get('increment_by'))

    self.assertEqual(increment_counter.count, sum(range(1, 10)))
def test_counter_taskset(self):
    increment_counter.count = 0
    ts = TaskSet(tasks=[
        increment_counter.s(),
        increment_counter.s(increment_by=2),
        increment_counter.s(increment_by=3),
        increment_counter.s(increment_by=4),
        increment_counter.s(increment_by=5),
        increment_counter.s(increment_by=6),
        increment_counter.s(increment_by=7),
        increment_counter.s(increment_by=8),
        increment_counter.s(increment_by=9),
    ])
    self.assertEqual(ts.total, 9)

    consumer = increment_counter.get_consumer()
    consumer.purge()
    consumer.close()
    taskset_res = ts.apply_async()
    subtasks = taskset_res.subtasks
    taskset_id = taskset_res.taskset_id
    consumer = increment_counter.get_consumer()
    for subtask in subtasks:
        m = consumer.queues[0].get().payload
        self.assertDictContainsSubset({'taskset': taskset_id,
                                       'task': increment_counter.name,
                                       'id': subtask.id}, m)
        increment_counter(
            increment_by=m.get('kwargs', {}).get('increment_by'))

    self.assertEqual(increment_counter.count, sum(xrange(1, 10)))
def run(self, xlsx_file, record=None, *args, **kwargs):
    self.record = record
    self.update_state(state="INITIALIZING", meta=self.progress)
    csv_metadata = self.convert_excel_to_csv(xlsx_file)
    for csv_filename, headers, types in csv_metadata:
        self.load_csv_into_db(csv_filename, headers, types)
    clear_db()
    self.translate_data()
    for meta in csv_metadata:
        self.drop_csv_table(meta[0])

    cursor = connection.cursor()
    cursor.execute("""
        SELECT DISTINCT postcode FROM advisers_location""")
    postcodes = cursor.fetchall()
    self.total = len(postcodes)

    def chunks(n=1000):
        for i in xrange(0, len(postcodes), n):
            yield postcodes[i:i + n]

    self.update_count()
    tasks = []
    for chunk in chunks():
        t = GeocoderTask().subtask(args=(chunk,))
        tasks.append(t)

    ts = TaskSet(tasks=tasks)
    res = ts.apply_async()

    task_counts = {}
    task_errors = {}

    def update_task_process(task_id, result):
        task_counts[task_id] = result.get("count")
        task_errors[task_id] = result.get("errors")

    while res.completed_count() < len(tasks):
        for r in res:
            if r.result:
                update_task_process(r.task_id, r.result)
        count = sum(task_counts.values())
        errors = list(itertools.chain(*task_errors.values()))
        self.update_count(count, errors)
        time.sleep(1)

    cache.clear()
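# update_task_process() above expects every GeocoderTask result to be a dict
# with "count" and "errors" keys, and each chunk is a list of one-column rows
# from cursor.fetchall(). A minimal sketch of a task returning that shape,
# assuming the pre-4.0 class-based Task API used above; geocode_postcode() is
# a hypothetical helper, not part of the original code.
from celery.task import Task


class GeocoderTask(Task):

    def run(self, postcodes, *args, **kwargs):
        count = 0
        errors = []
        for (postcode,) in postcodes:
            try:
                geocode_postcode(postcode)  # hypothetical geocoding call
                count += 1
            except Exception as e:
                errors.append('%s: %s' % (postcode, e))
        return {"count": count, "errors": errors}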
def handle(self, *args, **options):
    docs = RECAPDocument.objects.exclude(filepath_local='')
    if options['skip_ocr']:
        # Focus on the items that we don't know if they need OCR.
        docs = docs.filter(ocr_status=None)
    else:
        # We're doing OCR. Only work with those items that require it.
        docs = docs.filter(ocr_status=RECAPDocument.OCR_NEEDED)
    count = docs.count()
    print("There are %s documents to process." % count)

    if options.get('order') is not None:
        if options['order'] == 'small-first':
            docs = docs.order_by('page_count')
        elif options['order'] == 'big-first':
            docs = docs.order_by('-page_count')

    subtasks = []
    completed = 0
    for pk in docs.values_list('pk', flat=True):
        # Send the items off for processing.
        last_item = (count == completed + 1)
        subtasks.append(extract_recap_pdf.subtask(
            (pk, options['skip_ocr']),
            priority=5,
            queue=options['queue'],
        ))

        # Every queue_length items or on the last item, send the subtasks
        # to Celery.
        if (len(subtasks) >= options['queue_length']) or last_item:
            msg = ("Sent %s subtasks to celery. We have sent %s "
                   "items so far." % (len(subtasks), completed + 1))
            logger.info(msg)
            print(msg)
            job = TaskSet(tasks=subtasks)
            job.apply_async().join()
            subtasks = []
        completed += 1
def encode_video(cls, video_pk):
    '''Task to encode a ``Video`` into one or more ``VideoFile``'s.'''
    try:
        video_obj = cls.objects.get(pk=video_pk)
    except cls.DoesNotExist:
        # video was removed
        return
    filecls = video_obj.videofile_set.model
    files = list(video_obj.videofile_set.all())
    if video_obj.is_encoded and all(vfile.is_encoded for vfile in files):
        # video has already been processed
        return
    video = video_obj.video
    # a video has to meet the size requirements to be
    # encoded in a specific resolution
    if len(files) == 0:
        for setting in VideoSetting.objects.all():
            if video.width >= setting.width or video.height >= setting.height:
                vfile = filecls.objects.create(
                    original=video_obj,
                    format=setting.format,
                    width=setting.width,
                    height=setting.height)
                files.append(vfile)
                log.info('%r can be encoded to %r' % (video, setting))

    tasks = []
    for vfile in files:
        if vfile.is_encoded:
            continue
        if vfile.width > 600 or vfile.height > 300:
            encode_video_file.delay(filecls, vfile.pk)
        else:
            task = encode_video_file_quick.subtask(args=(filecls, vfile.pk))
            tasks.append(task)

    # create a taskset of the quick encodings
    job = TaskSet(tasks=tasks)
    result = job.apply_async()
    result.save()

    # start the publish_video callback when the taskset completes;
    # max_retries has to be set, otherwise the default is used.
    # check every 60 seconds whether the set has completed.
    callback = publish_video.subtask(args=(cls, video_pk))
    join_taskset.delay(result.taskset_id, callback, interval=60,
                       max_retries=300, propagate=False)
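# join_taskset() is not defined in this snippet. A minimal sketch of the
# common polling recipe it appears to follow, assuming the pre-4.0 API
# (TaskSetResult.restore() works because result.save() was called above):
# the task retries itself every `interval` seconds until the set is ready,
# then fires the callback with the joined results.
from celery.result import TaskSetResult
from celery.task import task
from celery.task.sets import subtask


@task(max_retries=None)
def join_taskset(taskset_id, callback, interval=60, max_retries=None,
                 propagate=True):
    result = TaskSetResult.restore(taskset_id)
    if result.ready():
        return subtask(callback).delay(result.join(propagate=propagate))
    join_taskset.retry(countdown=interval, max_retries=max_retries)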
def make_download_tasks(data, line_count, start_line):
    """For every item in the CSV, send it to Celery for processing"""
    previous_casenum = None
    subtasks = []
    completed = 0
    for index, item in data.iterrows():
        if completed < start_line - 1:
            # Skip ahead if start_line is provided.
            completed += 1
            continue

        if item['casenum'] != previous_casenum:
            # New case, get the docket before getting the pdf
            logger.info("New docket found with casenum: %s" % item['casenum'])
            previous_casenum = item['casenum']
            filename = get_docket_filename(item['court'], item['casenum'])
            url = get_docketxml_url(item['court'], item['casenum'])
            subtasks.append(download_recap_item.subtask((url, filename)))

        # Get the document
        filename = get_document_filename(item['court'], item['casenum'],
                                         item['docnum'], item['subdocnum'])
        url = get_pdf_url(item['court'], item['casenum'], filename)
        subtasks.append(download_recap_item.subtask((url, filename)))

        # Every n items or on the last item, send the subtasks to Celery.
        last_item = (line_count == completed + 1)
        if (len(subtasks) >= 1000) or last_item:
            msg = ("Sent %s subtasks to celery. We have processed %s "
                   "rows so far." % (len(subtasks), completed + 1))
            logger.info(msg)
            print msg
            job = TaskSet(tasks=subtasks)
            job.apply_async().join()
            subtasks = []
        completed += 1
def make_download_tasks(data, line_count, start_line):
    """For every item in the CSV, send it to Celery for processing"""
    previous_casenum = None
    subtasks = []
    completed = 0
    for index, item in data.iterrows():
        if completed < start_line - 1:
            # Skip ahead if start_line is provided.
            completed += 1
            continue
        last_item = (line_count == completed + 1)

        if item['casenum'] != previous_casenum:
            # New case, get the docket before getting the pdf
            logger.info("New docket found with casenum: %s" % item['casenum'])
            previous_casenum = item['casenum']
            filename = get_docket_filename(item['court'], item['casenum'])
            url = get_docketxml_url(item['court'], item['casenum'])
            subtasks.append(download_recap_item.subtask((url, filename)))

        # Get the document
        filename = get_document_filename(item['court'], item['casenum'],
                                         item['docnum'], item['subdocnum'])
        url = get_pdf_url(item['court'], item['casenum'], filename)
        subtasks.append(download_recap_item.subtask((url, filename)))

        # Every n items or on the last item, send the subtasks to Celery.
        if (len(subtasks) >= 1000) or last_item:
            msg = ("Sent %s subtasks to celery. We have processed %s "
                   "rows so far." % (len(subtasks), completed + 1))
            logger.info(msg)
            print msg
            job = TaskSet(tasks=subtasks)
            job.apply_async().join()
            subtasks = []
        completed += 1
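# The two functions above, like the OCR command earlier, batch signatures
# into TaskSets of up to 1000 and block on each batch with join(). TaskSet
# was deprecated in Celery 3.x and removed in 4.0; a rough sketch of the
# same batch-and-wait step using the group primitive that replaced it
# (illustrative only, not part of the original code):
from celery import group


def send_batch(subtasks):
    """Dispatch a batch of task signatures and wait for all of them."""
    result = group(subtasks).apply_async()
    return result.join()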
def test_function_taskset(self):
    subtasks = [return_True_task.s(i) for i in range(1, 6)]
    ts = TaskSet(subtasks)
    res = ts.apply_async()
    self.assertListEqual(res.join(), [True, True, True, True, True])
def test_function_taskset(self):
    with eager_tasks(self.app):
        subtasks = [return_True_task.s(i) for i in range(1, 6)]
        ts = TaskSet(subtasks)
        res = ts.apply_async()
        self.assertListEqual(res.join(), [True, True, True, True, True])
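# eager_tasks() is a test helper that is not shown here. A minimal sketch of
# what such a context manager typically does, assuming it only flips the
# app's CELERY_ALWAYS_EAGER setting so apply_async() runs the tasks inline;
# the real helper may do more.
from contextlib import contextmanager


@contextmanager
def eager_tasks(app):
    previous = app.conf.CELERY_ALWAYS_EAGER
    app.conf.CELERY_ALWAYS_EAGER = True
    try:
        yield
    finally:
        app.conf.CELERY_ALWAYS_EAGER = previous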