def process_region(input_seqs, trop_dict, basename, database=None,
                   mrbayes_args={}, extra_fields={}):
    try:
        if os.path.exists(basename + '.tree'):
            return
        handle = open(basename + '.tree', 'w')
        logging.info('Making Tree ' + str(extra_fields))
        contree, treeset = make_mrbayes_trees(input_seqs, **mrbayes_args)
        contree.write_to_stream(handle, 'nexus')
        treeset.write_to_path(basename + '.treeset', 'nexus')
    except IOError:
        return
    except OSError:
        return

    bats_write_subtask = subtask('TreeingTools.write_results_to_mongo', (), {
        'result_type': 'BATS',
        'extra_fields': extra_fields,
        'database': database
    })
    logging.info('Starting BATS ' + str(extra_fields))
    run_bats.apply_async(args=(basename + '.treeset', trop_dict),
                         kwargs={'nreps': 50},
                         link=bats_write_subtask)

    benj_write_subtask = subtask('TreeingTools.write_results_to_mongo', (), {
        'result_type': 'Benj',
        'extra_fields': extra_fields,
        'database': database
    })
    dmat = get_pairwise_distances(contree)
    logging.info('Starting Dist Pvals ' + str(extra_fields))
    check_distance_pvals.apply_async(args=(dmat, trop_dict),
                                     kwargs={'nreps': 500},
                                     link=benj_write_subtask)

def _process_customer(requester, customer, mailboxes, folders, users):
    if customer.id is None or (customer.emails is None and customer.fullname is None):
        # can't use customer with no data
        logger.debug("Customer '%s' for user '%s' cannot be used - no data",
                     (customer.id or customer.fullname), requester.username)
        return
    db_customer, created = Document.objects.get_or_create(
        helpscout_customer_id=customer.id,
        requester=requester,
        user_id=requester.id)
    db_customer.helpscout_name = customer.fullname
    logger.debug("Processing Helpscout customer '%s' for user '%s'",
                 customer.fullname, requester.username)
    new_updated = customer.modifiedat
    new_updated_ts = parse_dt(new_updated).timestamp()
    if not created and db_customer.last_updated_ts:
        new_updated_ts = db_customer.last_updated_ts \
            if db_customer.last_updated_ts > new_updated_ts else new_updated_ts
    db_customer.last_updated = datetime.utcfromtimestamp(
        new_updated_ts).isoformat() + 'Z'
    db_customer.last_updated_ts = new_updated_ts
    db_customer.helpscout_title = 'User: {}'.format(customer.fullname)
    db_customer.webview_link = 'https://secure.helpscout.net/customer/{}/0/'.format(
        customer.id)
    db_customer.primary_keywords = HELPSCOUT_KEYWORDS['primary']
    db_customer.secondary_keywords = HELPSCOUT_KEYWORDS['secondary']
    db_customer.helpscout_company = customer.organization
    db_customer.helpscout_emails = ', '.join(
        e.get('value') for e in customer.emails if 'value' in e) if customer.emails else None
    db_customer.save()
    algolia_engine.sync(db_customer, add=created)
    subtask(process_customer).delay(requester, db_customer, mailboxes, folders, users)

def get_friends_for_user(self, fb_id, callback, next_uri=None):
    """
    Get the facebook friends for the user with fb_id.

    1. Needs a valid access_token in the cache
    2. Needs 'user_friends' permission
    3. Needs a callback function that can store the friends somewhere

    If 1. is not present, the task is retried.
    If 2. is not the case, you're out of luck.
    """
    access_token = get_cached_access_token(fb_id)
    if access_token is None:
        raise self.retry(exc=ValueError(
            "Failed to fetch facebook data for %s. "
            "No access_token found in cache" % fb_id))

    graph = facebook.GraphAPI(access_token)
    try:
        if next_uri:
            data = graph.bare_request(next_uri)
        else:
            data = graph.get_connections('me', 'friends', limit=500)
    except facebook.GraphAPIError as exc:
        raise self.retry(exc=exc)

    subtask(callback).delay(data['data'])
    # guard against a missing 'paging' key on the last page
    if data.get('paging', {}).get('next'):
        self.delay(fb_id, callback, next_uri=data['paging']['next'])

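# The `callback` above is any signature taking one positional argument: the
# list of friend records from a single Graph API page. A minimal sketch of a
# compatible consumer (hypothetical: assumes a Celery `app` and a
# `save_friend` persistence helper, neither from the original source):
@app.task
def store_friends(friends):
    # one page of Graph API records: [{'id': ..., 'name': ...}, ...]
    for friend in friends:
        save_friend(friend['id'], friend.get('name'))

# usage: get_friends_for_user.delay(fb_id, store_friends.s())
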
def on_app_ready(sender=None, headers=None, body=None, **kwargs):
    if cache.get("CELERY_APP_READY", 0) == 1:
        return
    cache.set("CELERY_APP_READY", 1, 10)
    tasks = get_after_app_ready_tasks()
    logger.debug("Starting tasks registered to run after app ready: [{}]".format(", ".join(tasks)))
    for task in tasks:
        subtask(task).delay()

def on_app_ready(sender=None, headers=None, body=None, **kwargs):
    if cache.get("CELERY_APP_READY", 0) == 1:
        return
    cache.set("CELERY_APP_READY", 1, 10)
    logger.debug("App ready signal received")
    logger.debug("Starting tasks registered to run after app ready: [{}]".format(
        ", ".join(__AFTER_APP_READY_RUN_TASKS)))
    for task in __AFTER_APP_READY_RUN_TASKS:
        subtask(task).delay()

def unlock_graph(result, callback, interval=1, propagate=False, max_retries=None):
    if result.ready():
        second_level_res = result.get()
        if second_level_res.ready():
            subtask(callback).delay(list(joinall(
                second_level_res, propagate=propagate)))
    else:
        unlock_graph.retry(countdown=interval, max_retries=max_retries)

def on_app_ready(sender=None, headers=None, **kwargs):
    if cache.get("CELERY_APP_READY", 0) == 1:
        return
    cache.set("CELERY_APP_READY", 1, 10)
    tasks = get_after_app_ready_tasks()
    logger.debug("Worker ready signal received")
    logger.debug("Starting tasks registered to run after app ready: [{}]".format(", ".join(tasks)))
    for task in tasks:
        subtask(task).delay()

def on_chord_part_return(self, task, propagate=False):
    from celery import subtask
    from celery.result import TaskSetResult
    setid = task.request.taskset
    if not setid:
        return
    key = self.get_key_for_chord(setid)
    deps = TaskSetResult.restore(setid, backend=task.backend)
    if self.client.incr(key) >= deps.total:
        subtask(task.request.chord).delay(deps.join(propagate=propagate))
        deps.delete()
        self.client.delete(key)

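# Context for the chord backend hooks in this section: on_chord_part_return is
# called once per finished chord header task; when the incremented counter
# reaches the size of the group, the stored callback signature is applied to
# the joined results. A minimal client-side sketch that exercises this path
# (hypothetical demo app and tasks, not from any snippet above):
from celery import Celery, chord

demo_app = Celery('chord_demo', broker='memory://', backend='cache+memory://')

@demo_app.task
def add(x, y):
    return x + y

@demo_app.task
def tsum(numbers):
    return sum(numbers)

# with a worker running:
# chord(add.s(i, i) for i in range(10))(tsum.s())  # tsum receives [0, 2, ..., 18]
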
def _launch_all_scanning_subtasks_after_task_enum(self, jobs):
    err_handler = ScanRunErrorHandlerTask()
    result_handler = ScanRunResultHandlerTask()
    # filter out None jobs that we don't have to launch
    logr.debug("Enumerating scanning subtasks based on '{0}' jobs.".format(len(jobs)))
    filt_jobs = filter(lambda jobdb: jobdb[0] is not None, jobs)
    r = [job.apply_async(timeout=self.scan_timeout * 0.9,
                         link_error=subtask(err_handler, queue=err_handler.queue),
                         link=subtask(result_handler,
                                      args=(db_entry.task_id,),
                                      queue=result_handler.queue),
                         )
         for job, db_entry in filt_jobs]
    return r

def collect_files(requester, repo_id, repo_name, repo_url, default_branch, enrichment_delay):
    """
    List all files in a repo - should be called once, after first sync of a repo.
    Subsequent syncing is handled via collect_commits() function.

    Note that this uses Github's API call for retrieval of recursive trees:
    https://developer.github.com/v3/git/trees/#get-a-tree-recursively
    This API call returns a flat list of all files and saves us many API calls that
    would be needed to recursively fetch files for each repo directory. But it may
    not work well for very big repos (> 5k files), because Github API has a limit
    on the number of elements it will return in one call.
    """
    github_client = init_github_client(requester)
    repo = github_client.get_repo(full_name_or_id=repo_name)
    new_files = []
    for f in repo.get_git_tree(sha=repo.default_branch, recursive=True).tree:
        db_file, created = Document.objects.get_or_create(
            github_file_id=_compute_sha('{}{}'.format(repo_id, f.path)),
            github_repo_id=repo_id,
            requester=requester,
            user_id=requester.id)
        if created:
            new_files.append({
                'sha': f.sha,
                'filename': f.path,
                'action': 'modified',
                'type': f.type
            })
        db_file.primary_keywords = GITHUB_PRIMARY_KEYWORDS
        db_file.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['file']
        # set the timestamp to 0 (epoch) to signal that we don't know the update timestamp
        db_file.last_updated_ts = 0
        db_file.last_updated = datetime.utcfromtimestamp(0).isoformat() + 'Z'
        db_file.github_title = '{}: {}'.format(
            'Dir' if f.type == 'tree' else 'File', f.path.split('/')[-1])
        db_file.github_file_path = f.path
        db_file.github_repo_full_name = repo_name
        db_file.webview_link = '{}/blob/{}/{}'.format(repo_url, default_branch, f.path)
        algolia_engine.sync(db_file, add=created)
        db_file.last_synced = get_utc_timestamp()
        db_file.download_status = Document.PENDING
        db_file.save()
    # run enrich_files() for all new_files in chunks of 50 items
    i = 0
    for ff in [new_files[x:x + 50] for x in range(0, len(new_files), 50)]:
        i = i + 1
        subtask(enrich_files).apply_async(
            args=[requester, ff, repo.id, repo_name, repo_url, default_branch],
            countdown=enrichment_delay + (240 * i))

def update_synchronization():
    """
    Check for new/updated files in external systems for all users.
    Should be called periodically after initial syncing.
    Gdrive-only at the moment.
    """
    logger.debug("Update synchronizations started")
    for sa in SocialAttributes.objects.filter(start_page_token__isnull=False):
        if should_sync(sa.user, 'google-oauth2', 'tasks.gdrive'):
            if sa.user.social_auth.filter(provider='google-oauth2').first():
                access_token, refresh_token = get_google_tokens(sa.user)
                subtask(sync_gdrive_changes).delay(
                    sa.user, access_token, refresh_token, sa.start_page_token)
        else:
            logger.info("Gdrive oauth token for user '%s' already in use, skipping sync ...",
                        sa.user.username)

def run_ansible_task(tid, callback=None, **kwargs):
    """
    :param tid: the id of the task to run
    :param callback: callback task name
    :return:
    """
    task = get_object_or_none(Task, id=tid)
    if task:
        result = task.run()
        if callback is not None:
            subtask(callback).delay(result, task_name=task.name)
        return result
    else:
        logger.error("No task found")

def on_chord_part_return(self, task, propagate=None):
    if not self.implements_incr:
        return
    from celery import subtask
    from celery.result import GroupResult
    app = self.app
    if propagate is None:
        propagate = self.app.conf.CELERY_CHORD_PROPAGATES
    gid = task.request.group
    if not gid:
        return
    key = self.get_key_for_chord(gid)
    deps = GroupResult.restore(gid, backend=task.backend)
    if deps is None:
        callback = subtask(task.request.chord)
        return app._tasks[callback.task].backend.fail_from_current_stack(
            callback.id,
            exc=ChordError('GroupResult {0} no longer exists'.format(gid))
        )
    val = self.incr(key)
    if val >= len(deps):
        callback = subtask(task.request.chord)
        j = deps.join_native if deps.supports_native_join else deps.join
        try:
            ret = j(propagate=propagate)
        except Exception as exc:
            try:
                culprit = next(deps._failed_join_report())
                reason = 'Dependency {0.id} raised {1!r}'.format(
                    culprit, exc,
                )
            except StopIteration:
                reason = repr(exc)
            app._tasks[callback.task].backend.fail_from_current_stack(
                callback.id, exc=ChordError(reason),
            )
        else:
            try:
                callback.delay(ret)
            except Exception as exc:
                app._tasks[callback.task].backend.fail_from_current_stack(
                    callback.id,
                    exc=ChordError('Callback error: {0!r}'.format(exc)),
                )
        finally:
            deps.delete()
            self.client.delete(key)
    else:
        self.expire(key, 86400)

def rollout(self, data=None, callback=None):
    logger.info(f"Starting Rollout for {data['hostname']}")
    payload = {
        "deployment_id": data["deployment_id"],
        "versionlock": data["versionlock"],
    }
    # was `http = get_http` (a bare function reference); it must be called
    # to obtain a client with a .post() method
    http = get_http()
    r = http.post(
        f"{data['protocol']}://{data['hostname']}:{data['port']}/api/{data['version']}/rollout",
        json=payload)
    result = r.json()
    if callback is not None:
        subtask(callback).delay(result)
    return result

def dmap(it, callback):
    """
    Distributed Map function. Given an iterable of data and a task method,
    map the method over the given data.
    """
    callback = subtask(callback)
    return group(callback.clone([arg, ]) for arg in it)()

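# Usage sketch for dmap (hedged: assumes dmap is registered as a Celery task,
# which the snippet above doesn't show). A task cannot return a live group, so
# the producer returns a plain list and dmap.s(...) fans it out, e.g.:
#
#     chain(fetch_ids.s(), dmap.s(process.s())).apply_async()
#
# where fetch_ids and process are hypothetical tasks.
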
def updateAllPincodes():
    '''Fetch new data for all pincodes currently in the database.
    Returns a group so that this update can be executed in parallel'''
    callback = subtask(updatePincode.s())
    return group(
        callback.clone((pincode.code, ))
        for pincode in Pincode.query.filter(Pincode.subscriptions.any()).all())()

def on_chord_part_return(self, task, propagate=True):
    if not self.implements_incr:
        return
    from celery import subtask
    from celery.result import GroupResult
    gid = task.request.group
    if not gid:
        return
    key = self.get_key_for_chord(gid)
    deps = GroupResult.restore(gid, backend=task.backend)
    val = self.incr(key)
    if val >= len(deps):
        j = deps.join_native if deps.supports_native_join else deps.join
        callback = subtask(task.request.chord)
        try:
            ret = j(propagate=propagate)
        except Exception as exc:
            culprit = next(deps._failed_join_report())
            self.app._tasks[callback.task].backend.fail_from_current_stack(
                callback.id,
                exc=ChordError('Dependency %s raised %r' % (culprit.id, exc))
            )
        else:
            callback.delay(ret)
        finally:
            deps.delete()
            self.client.delete(key)
    else:
        self.expire(key, 86400)

def updateAllSubscribers():
    '''Notify new data for all subscriptions currently in the database.
    Returns a group so that this update can be executed in parallel'''
    callback = subtask(updateSubscriber.s())
    return group(
        callback.clone((subscription.id, ))
        for subscription in Subscription.query.filter(Subscription.pincodes.any()).all())()

def get_tweets():
    """Get some tweets from the twitter api and store them to the db."""
    if not Tweet.objects.all():
        # If the db is empty, don't get max_id.
        tweets = api.search(
            q='#python',
            count=100
        )
    else:
        # If the db is not empty, get max_id.
        # .delay() added: subtask(...) alone only builds the signature
        # without ever queuing the cleanup task
        subtask(clean_tweetdb).delay()
        max_id = min([tweet.tweet_id for tweet in Tweet.objects.all()])
        tweets = api.search(
            q='#python',
            max_id=max_id,
            count=100
        )

    # Store the tweet data in lists.
    tweets_id = [tweet.id for tweet in tweets]
    tweets_date = [tweet.created_at for tweet in tweets]
    tweets_source = [tweet.source for tweet in tweets]
    tweets_favorite_cnt = [tweet.favorite_count for tweet in tweets]
    tweets_retweet_cnt = [tweet.retweet_count for tweet in tweets]
    tweets_text = [tweet.text for tweet in tweets]

    # Iterate over these lists and add data to db.
    for i, j, k, l, m, n in zip(
            tweets_id,
            tweets_date,
            tweets_source,
            tweets_favorite_cnt,
            tweets_retweet_cnt,
            tweets_text,
    ):
        try:
            # Check that they are valid.
            Tweet.objects.create(
                tweet_id=i,
                tweet_date=j,
                tweet_source=k,
                tweet_favorite_cnt=l,
                tweet_retweet_cnt=m,
                tweet_text=n,
            )
        except IntegrityError:
            pass

def callback_map(it, callback):
    logger.info(sys._getframe().f_code.co_name + " start")
    # ref: http://stackoverflow.com/questions/13271056/how-to-chain-a-celery-task-that-returns-a-list-into-a-group
    # Map a callback over an iterator and return as a group
    # print('it: ' + str(it))
    callback = subtask(callback)
    return group(callback.clone([arg, ]) for arg in it)()

def on_chord_part_return(self, task, propagate=False):
    if not self.implements_incr:
        return
    from celery import subtask
    from celery.result import GroupResult
    gid = task.request.group
    if not gid:
        return
    key = self.get_key_for_chord(gid)
    deps = GroupResult.restore(gid, backend=task.backend)
    val = self.incr(key)
    if val >= len(deps):
        subtask(task.request.chord).delay(deps.join(propagate=propagate))
        deps.delete()
        self.client.delete(key)
    else:
        self.expire(key, 86400)

def execute_task(self,
                 task_name,
                 task_queue=None,
                 kwargs=None,
                 node_context=None,
                 send_task_events=DEFAULT_SEND_TASK_EVENTS,
                 total_retries=None,
                 retry_interval=None):
    """
    Execute a task

    :param task_name: the task name
    :param task_queue: the task queue, if None runs the task locally
    :param kwargs: optional kwargs to be passed to the task
    :param node_context: Used internally by node.execute_operation
    """
    kwargs = kwargs or {}
    task_id = str(uuid.uuid4())
    cloudify_context = self._build_cloudify_context(
        task_id, task_queue, task_name, node_context)
    kwargs['__cloudify_context'] = cloudify_context
    if task_queue is None:
        # Local task
        values = task_name.split('.')
        module_name = '.'.join(values[:-1])
        method_name = values[-1]
        module = importlib.import_module(module_name)
        task = getattr(module, method_name)
        return self.local_task(local_task=task,
                               info=task_name,
                               name=task_name,
                               kwargs=kwargs,
                               task_id=task_id,
                               send_task_events=send_task_events,
                               total_retries=total_retries,
                               retry_interval=retry_interval)
    else:
        # Remote task
        # Import here because this only applies to remote tasks execution
        # environment
        import celery
        task = celery.subtask(task_name,
                              kwargs=kwargs,
                              queue=task_queue,
                              immutable=True)
        return self.remote_task(task=task,
                                cloudify_context=cloudify_context,
                                task_id=task_id,
                                send_task_events=send_task_events,
                                total_retries=total_retries,
                                retry_interval=retry_interval)

def group_tasks(it, callback):
    """
    Combine chain and group: map the callback over the iterable and run
    the resulting signatures in parallel.
    :param it:
    :param callback:
    :return:
    """
    callback = subtask(callback)
    result = group(callback.clone((args, )) for args in it)()
    return result

def dmap(self, it, callback):
    """
    Map a callback over an iterator and return as a group
    """
    callback = subtask(callback)
    tasks = group([callback.clone([arg, ]) for arg in it])
    _tasks = tasks()
    # collect the child task ids for bookkeeping (the original used an
    # identity map here, which stored the AsyncResults themselves)
    ids = [child.id for child in _tasks]
    app.backend.mark_as_started(self.request.id, **{'__subtasks': ids})
    return _tasks

def dmap(it, callback):
    '''
    Map a callback over an iterator and return as a group

    args:
        it: list/tuple/gen: input iterator
        callback: celery.Task: function to apply for each item in it
    return:
        celery.group: ...
    '''
    callback = subtask(callback)
    return group(callback.clone([arg, ]) for arg in it)()

def dmap(args_iter, celery_task):
    """
    Takes an iterator of argument tuples and queues them up for celery to run
    with the function.
    """
    callback = subtask(celery_task)
    if isinstance(args_iter, list):
        run_in_parallel = group(
            clone_signature(callback, args=(args, )) for args in args_iter)
    elif isinstance(args_iter, dict):
        run_in_parallel = group(clone_signature(callback, kwargs=args_iter))
    return run_in_parallel.delay()

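# clone_signature is used above but not shown. A plausible sketch (hypothetical
# reconstruction, not the original): unlike Signature.clone(args), which
# *prepends* partial args, this variant replaces args/kwargs outright so each
# group member gets exactly the arguments it was given.
def clone_signature(sig, args=(), kwargs=None, **opts):
    cloned = sig.clone()
    if args:
        cloned.args = args
    if kwargs:
        cloned.kwargs = kwargs
    if opts:
        cloned.options.update(opts)
    return cloned
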
def on_chord_part_return(self, task, propagate=None):
    if not self.implements_incr:
        return
    from celery import subtask
    from celery.result import GroupResult
    app = self.app
    if propagate is None:
        propagate = self.app.conf.CELERY_CHORD_PROPAGATES
    gid = task.request.group
    if not gid:
        return
    key = self.get_key_for_chord(gid)
    deps = GroupResult.restore(gid, backend=task.backend)
    val = self.incr(key)
    if val >= len(deps):
        j = deps.join_native if deps.supports_native_join else deps.join
        callback = subtask(task.request.chord)
        try:
            ret = j(propagate=propagate)
        except Exception as exc:
            try:
                culprit = next(deps._failed_join_report())
                reason = 'Dependency {0.id} raised {1!r}'.format(
                    culprit, exc,
                )
            except StopIteration:
                reason = repr(exc)
            app._tasks[callback.task].backend.fail_from_current_stack(
                callback.id, exc=ChordError(reason),
            )
        else:
            try:
                callback.delay(ret)
            except Exception as exc:
                app._tasks[callback.task].backend.fail_from_current_stack(
                    callback.id,
                    exc=ChordError('Callback error: {0!r}'.format(exc)),
                )
        finally:
            deps.delete()
            self.client.delete(key)
    else:
        self.expire(key, 86400)

def add_job(self, job, job_id=None):
    """ Add a job (and its tasks) to the queue and update the monitoring counters """
    if not job_id:
        job_id = job["id"]
    self.add_tasks(job["tasks"], job_id)
    self.jqueuer_job_added_count += 1
    monitoring.add_job(self.experiment_id, self.service_name, job_id)
    job_queue_id = ("j_" + self.service_name + "_" +
                    str(int(round(time.time() * 1000))) + "_" +
                    str(random.randrange(100, 999)))
    chain = subtask("job_operations.add",
                    queue=JOB_QUEUE_PREFIX + self.service_name)
    chain.delay(self.experiment_id, job_queue_id, job)

def add_job(self, job, job_id=None):
    if not job_id:
        job_id = job["id"]
    self.add_tasks(job['tasks'], job_id)
    self.jqueuer_job_added_count += 1
    monitoring.add_job(self.experiment_id, self.service_name, job_id)
    job_queue_id = "j_" + self.service_name + "_" + str(
        int(round(time.time() * 1000))) + "_" + str(
            random.randrange(100, 999))
    chain = subtask('job_operations.add',
                    queue=JOB_QUEUE_PREFIX + self.service_name)
    chain.delay(self.experiment_id, job_queue_id, job)
    self.add_log(
        "Job added: job_id {}, job_queue_id {}, JOB_QUEUE_PREFIX {}".format(
            str(job_id), str(job_queue_id), str(JOB_QUEUE_PREFIX)))

def get_task(self, workflow_task, queue=None, target=None):
    runtime_props = []

    def _derive(property_name):
        executor = workflow_task.cloudify_context['executor']
        host_id = workflow_task.cloudify_context['host_id']
        if executor == 'host_agent':
            if len(runtime_props) == 0:
                host_node_instance = get_node_instance(host_id)
                cloudify_agent = host_node_instance.runtime_properties.get(
                    'cloudify_agent')
                if not cloudify_agent:
                    raise exceptions.NonRecoverableError(
                        'Missing cloudify_agent runtime information. '
                        'This most likely means that the Compute node '
                        'never started successfully')
                runtime_props.append(cloudify_agent)
            return runtime_props[0][property_name]
        return self.workflow_ctx.deployment.id

    if queue is None:
        queue = _derive('queue')
    if target is None:
        target = _derive('name')

    kwargs = workflow_task.kwargs
    # augment cloudify context with target and queue
    kwargs['__cloudify_context']['task_queue'] = queue
    kwargs['__cloudify_context']['task_target'] = target

    # Remote task
    # Import here because this only applies to remote tasks execution
    # environment
    import celery
    return celery.subtask(workflow_task.name,
                          kwargs=kwargs,
                          queue=queue,
                          immutable=True), queue, target

def test_on_chord_part_return(self, restore):
    b = self.MockBackend(app=self.app)
    deps = Mock()
    deps.__len__ = Mock()
    deps.__len__.return_value = 10
    restore.return_value = deps
    b.client.incr.return_value = 1
    task = Mock()
    task.name = 'foobarbaz'
    self.app.tasks['foobarbaz'] = task
    task.request.chord = subtask(task)
    task.request.group = 'group_id'

    b.on_chord_part_return(task)
    self.assertTrue(b.client.incr.call_count)

    b.client.incr.return_value = len(deps)
    b.on_chord_part_return(task)
    deps.join_native.assert_called_with(propagate=True)
    deps.delete.assert_called_with()

    self.assertTrue(b.client.expire.call_count)

def test_on_chord_part_return(self, restore):
    tb = CacheBackend(backend='memory://', app=self.app)

    deps = Mock()
    deps.__len__ = Mock()
    deps.__len__.return_value = 2
    restore.return_value = deps
    task = Mock()
    task.name = 'foobarbaz'
    self.app.tasks['foobarbaz'] = task
    task.request.chord = subtask(task)

    gid, res = uuid(), [self.app.AsyncResult(uuid()) for _ in range(3)]
    task.request.group = gid
    tb.on_chord_apply(gid, {}, result=res)

    self.assertFalse(deps.join_native.called)
    tb.on_chord_part_return(task)
    self.assertFalse(deps.join_native.called)

    tb.on_chord_part_return(task)
    deps.join_native.assert_called_with(propagate=True)
    deps.delete.assert_called_with()

def on_chord_part_return(self, task, propagate=None):
    if not self.implements_incr:
        return
    from celery import subtask
    from celery.result import GroupResult
    app = self.app
    if propagate is None:
        propagate = self.app.conf.CELERY_CHORD_PROPAGATES
    gid = task.request.group
    if not gid:
        return
    key = self.get_key_for_chord(gid)
    deps = GroupResult.restore(gid, backend=task.backend)
    val = self.incr(key)
    if val >= len(deps):
        j = deps.join_native if deps.supports_native_join else deps.join
        callback = subtask(task.request.chord)
        try:
            ret = j(propagate=propagate)
        except Exception, exc:
            try:
                culprit = deps._failed_join_report().next()
                reason = 'Dependency %s raised %r' % (culprit.id, exc)
            except StopIteration:
                reason = repr(exc)
            app._tasks[callback.task].backend.fail_from_current_stack(
                callback.id, exc=ChordError(reason),
            )
        else:
            try:
                callback.delay(ret)
            except Exception, exc:
                app._tasks[callback.task].backend.fail_from_current_stack(
                    callback.id,
                    exc=ChordError('Callback error: %r' % (exc, )),
                )
        # cleanup restored from the sibling versions of this method above,
        # where the snippet appears to have been truncated
        finally:
            deps.delete()
            self.client.delete(key)
    else:
        self.expire(key, 86400)

def dmap(it, callback):
    # http://stackoverflow.com/questions/13271056/how-to-chain-a-celery-task-that-returns-a-list-into-a-group
    # Map a callback over an iterator and return as a group
    callback = subtask(callback)
    return group(callback.clone((arg, )) for arg in it)()

def dmap(it, callback):
    # Map a callback over an iterator and return as a group
    callback = subtask(callback)
    return group(callback.clone((arg, )) for arg in it)()

def video_map(videos_list, processing_callback, link):
    callback = subtask(processing_callback)
    return group(callback.clone([arg, ], link=link)
                 for arg in videos_list if arg)()

def hello(name, callback=None):
    print("Hello {}".format(name))
    if callback is not None:
        subtask(callback).delay("Guahongwei")

from celery import subtask
from celery import group, chain, chord, chunks

from proj import tasks

# Get subtask
## method 1
sub1 = subtask(tasks.add, args=(2, 2), countdown=1)
res = sub1.apply_async()
#print res.get()

## method 2
sub = tasks.add.subtask((2, 2), countdown=1)
res = sub.apply_async()
#print res.get()

## method 3
#print tasks.add.s(2, 2).set(countdown=1).apply_async().get()

# Partials
partial = tasks.add.s(2)
#print partial.delay(2).get()

# Immutability
#sub = tasks.add.apply_async((2, 2), link=tasks.add_callback_noarg.subtask(immutable=True))
sub = tasks.add.apply_async((2, 2), link=tasks.add_callback_noarg.si())
#sub.get()

# Callbacks
sub = tasks.add.apply_async((2, 2), link=tasks.minus.s(3))

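# The script above imports `proj.tasks`, which is not shown. A minimal sketch
# of a compatible proj/tasks.py (assumed for illustration, not the original):
from celery import Celery

app = Celery('proj', broker='redis://localhost:6379/0',
             backend='redis://localhost:6379/0')

@app.task
def add(x, y):
    return x + y

@app.task
def minus(x, y):
    return x - y

@app.task
def add_callback_noarg():
    # linked via .si()/immutable=True above, so it deliberately ignores
    # the parent task's result
    print('add finished')
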
def process_gdrive_docs(requester, access_token, refresh_token, files_fn, json_key):
    service = connect_to_gdrive(access_token, refresh_token)

    folders = {}
    page_token = None
    new_start_page_token = None
    while True:
        files = files_fn(service, page_token)
        new_start_page_token = files.get('newStartPageToken', new_start_page_token)
        items = files.get(json_key, [])
        if not folders and len(items) > 0:
            # retrieve all folders to be able to get file path more easily in the file listing(s)
            logger.debug("Getting folders for %s/%s", requester.id, requester.username)
            folders = get_gdrive_folders(service)
            # check if any folder was marked as hidden and we already have it synced ...
            # if we do, then remove it (plus all children) from our indexing
            for folder_id, folder in folders.items():
                if folder.get('hidden') is True:
                    desync_folder(folder.get('id'), folders, requester, service)
        for item in items:
            if 'file' in item:
                item = item['file']
            # check for ignored mime types
            if any(x.match(item.get('mimeType', '')) for x in IGNORED_MIMES):
                continue
            parents = item.get('parents', [])
            hidden = is_hidden(item.get('description')) or \
                any(is_hidden_in_folder(f, folders) for f in parents)
            if item.get('trashed') or hidden:
                # file was removed or hidden
                Document.objects.filter(
                    document_id=item['id'],
                    requester=requester,
                    user_id=requester.id
                ).delete()
                continue
            # handle file path within gdrive
            parent = parents[0] if parents else None
            path = get_gdrive_path(parent, folders)
            doc, created = get_or_create(
                model=Document,
                document_id=item['id'],
                requester=requester,
                user_id=requester.id
            )
            doc.mime_type = item.get('mimeType').lower()
            doc.title = item.get('name')
            doc.webview_link = item.get('webViewLink')
            doc.icon_link = item.get('iconLink')
            doc.thumbnail_link = item.get('thumbnailLink')
            doc.last_updated = item.get('modifiedTime')
            doc.path = path
            last_modified_on_server = parse_date(doc.last_updated)
            doc.last_updated_ts = last_modified_on_server.timestamp()
            doc.modifier_display_name = item.get('lastModifyingUser', {}).get('displayName')
            doc.modifier_photo_link = item.get('lastModifyingUser', {}).get('photoLink')
            doc.owner_display_name = item['owners'][0]['displayName']
            doc.owner_photo_link = item.get('owners', [{}])[0].get('photoLink')
            doc.primary_keywords = GDRIVE_KEYWORDS['primary']
            doc.secondary_keywords = GDRIVE_KEYWORDS['secondary'][doc.mime_type] \
                if doc.mime_type in GDRIVE_KEYWORDS['secondary'] else None

            can_download = item.get('capabilities', {}).get('canDownload', True)
            if can_download:
                # check also the mime type as we only support some of them
                if not any(x for x in EXPORTABLE_MIMES if doc.mime_type.startswith(x)):
                    can_download = False
            if can_download:
                if not created:
                    if doc.download_status is Document.READY and can_download and \
                            (doc.last_synced is None or last_modified_on_server > doc.last_synced):
                        doc.download_status = Document.PENDING
                        subtask(download_gdrive_document).delay(doc, access_token, refresh_token)
                else:
                    algolia_engine.sync(doc, add=created)
                    subtask(download_gdrive_document).delay(doc, access_token, refresh_token)
            else:
                doc.download_status = Document.READY
                doc.last_synced = get_utc_timestamp()
                doc.save()
                algolia_engine.sync(doc, add=False)
            doc.save()

        page_token = files.get('nextPageToken')
        if not page_token:
            break

    return new_start_page_token

def map_signature_chain(args_list, *signatures):
    return group([
        chain(
            subtask(signatures[0]).clone((args, )),
            *(subtask(sig) for sig in signatures[1:]))
        for args in args_list
    ])

def callback_list_map(lists, callback):
    logger.info(sys._getframe().f_code.co_name + " start")
    callback = subtask(callback)
    return group(callback.clone([arg, ]) for arg in lists.get())()

def dmap(it, callback):
    callback = subtask(callback)
    return group(callback.clone([arg, ]) for arg in it)()

def map_single_task(args_list, *signatures):
    return group(
        [subtask(signatures[0]).clone((args, )) for args in args_list])

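# How the two helpers above compose (a sketch with hypothetical normalize/
# score/save tasks): each element of args_list is passed whole as one
# positional argument; map_signature_chain additionally pipes every element
# through the remaining signatures.
#
#     map_single_task([1, 2, 3], normalize.s()).apply_async()
#     map_signature_chain([1, 2, 3], normalize.s(), score.s(), save.s()).apply_async()
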
def add_cb(x, y, callback=None):
    result = x + y
    if callback:
        # apply_async expects a tuple of positional args, not a bare value
        return subtask(callback).apply_async((result, ))
    return result

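# Usage sketch for add_cb (hypothetical log_result task): the callback
# signature receives the computed sum as its first argument.
#
#     add_cb.delay(2, 3, callback=log_result.s())  # log_result is called with 5
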
def collect_repos(requester):
    github_client = init_github_client(requester)
    # simple check if we are approaching api rate limits
    if github_client.rate_limiting[0] < 500:
        logger.debug("Skipping github repos sync for user '%s' due to rate limits",
                     requester.username)
        return

    i = 0
    for repo in github_client.get_user().get_repos():
        if not (repo.id or repo.full_name):
            logger.debug("Skipping github repo '%s' for user '%s'",
                         repo.full_name, requester.username)
            # seems like broken data, skip it
            continue
        if repo.fork:
            # don't process forked repos
            logger.debug("Skipping forked github repo '%s' for user '%s'",
                         repo.full_name, requester.username)
            continue
        db_repo, created = Document.objects.get_or_create(
            github_repo_id=repo.id,
            github_commit_id__isnull=True,
            github_file_id__isnull=True,
            github_issue_id__isnull=True,
            requester=requester,
            user_id=requester.id)
        db_repo.primary_keywords = GITHUB_PRIMARY_KEYWORDS
        db_repo.secondary_keywords = GITHUB_SECONDARY_KEYWORDS['repo']
        db_repo.github_title = 'Repo: {}'.format(repo.name)
        db_repo.github_repo_owner = repo.owner.login
        db_repo.github_repo_description = repo.description
        logger.debug("Processing github repo '%s' for user '%s'",
                     repo.full_name, requester.username)
        commit_count = 0
        contributors = []
        try:
            # fetch contributors
            for cnt in repo.get_contributors():
                commit_count = commit_count + cnt.contributions
                if len(contributors) <= 10:
                    contributors.append({
                        'name': cnt.name,
                        'url': cnt.html_url,
                        'avatar': cnt.avatar_url
                    })
        except UnknownObjectException:
            # most probably, this repo is disabled
            if created:
                logger.debug("Removing github repo '%s' for user '%s'",
                             repo.full_name, requester.username)
                db_repo.delete()
            continue
        db_repo.github_repo_commit_count = commit_count
        db_repo.github_repo_contributors = contributors
        db_repo.github_repo_full_name = repo.full_name
        new_timestamp = max(repo.updated_at, repo.pushed_at)
        if created or new_timestamp.timestamp() > (db_repo.last_updated_ts or 0):
            i = i + 1
            db_repo.last_updated_ts = new_timestamp.timestamp()
            db_repo.last_updated = new_timestamp.isoformat() + 'Z'
            db_repo.webview_link = repo.html_url
            # fetch readme file
            try:
                readme = repo.get_readme()
                readme_content = cut_utf_string(
                    readme.decoded_content.decode('UTF-8', errors='replace'),
                    9000, step=100)
                md = github_client.render_markdown(text=readme_content).decode(
                    'UTF-8', errors='replace')
                # also replace <em> tags, because they are used by Algolia highlighting
                db_repo.github_repo_content = md.replace('<em>', '<b>').replace(
                    '</em>', '</b>')
                db_repo.github_repo_readme = readme.name
            except UnknownObjectException:
                # readme does not exist
                db_repo.github_repo_content = None
            algolia_engine.sync(db_repo, add=created)
            if created:
                # sync files
                subtask(collect_files).delay(
                    requester, repo.id, repo.full_name, repo.html_url,
                    repo.default_branch, enrichment_delay=i * 300)
            # sync commits
            subtask(collect_commits).apply_async(
                args=[requester, repo.id, repo.full_name, repo.html_url,
                      repo.default_branch, commit_count],
                countdown=240 * i if created else 1)
            # sync issues
            subtask(collect_issues).apply_async(
                args=[requester, repo.id, repo.full_name, created],
                countdown=180 * i if created else 1)
        db_repo.last_synced = get_utc_timestamp()
        db_repo.download_status = Document.READY
        db_repo.save()

def collect_boards(requester):
    trello_client = init_trello_client(requester)
    orgs = dict()
    for board in trello_client.list_boards(board_filter='open,closed'):
        db_board, created = Document.objects.get_or_create(
            trello_board_id=board.id,
            trello_card_id__isnull=True,
            requester=requester,
            user_id=requester.id
        )
        board_last_activity = board.raw.get('dateLastActivity')
        if not board_last_activity:
            # this nasty hack is needed, because some Trello boards don't have a
            # 'dateLastActivity' timestamp
            # -> looks like it's those boards that have been inactive for some time
            if not created:
                board_last_activity = db_board.last_updated.isoformat()
            else:
                # Trello was established in 2011, so we use 01.01.2011 as epoch
                actions = board.fetch_actions(action_filter='all', action_limit=1,
                                              since='2011-01-01T00:00:00.000Z')
                if actions:
                    board_last_activity = actions[0].get('date')
        last_activity = parse_dt(board_last_activity).isoformat()
        last_activity_ts = int(parse_dt(board_last_activity).timestamp())
        if not created and db_board.download_status == Document.READY and \
                (db_board.last_updated_ts and db_board.last_updated_ts >= last_activity_ts):
            logger.debug("Trello board '%s' for user '%s' hasn't changed",
                         board.name[:50], requester.username)
            continue
        logger.debug("Processing board '%s' for user '%s'",
                     board.name[:50], requester.username)
        db_board.primary_keywords = TRELLO_PRIMARY_KEYWORDS
        db_board.secondary_keywords = TRELLO_SECONDARY_KEYWORDS['board']
        db_board.last_updated = last_activity
        db_board.last_updated_ts = last_activity_ts
        db_board.trello_title = 'Board: {}'.format(board.name)
        db_board.webview_link = board.url
        db_board._trello_description = board.description
        db_board.trello_board_status = 'Closed' if board.closed else 'Open'
        orgId = board.raw.get('idOrganization')
        if orgId and orgId not in orgs:
            try:
                org = trello_client.get_organization(orgId).raw
                orgs[orgId] = {
                    'name': org.get('displayName'),
                    'logo': 'https://trello-logos.s3.amazonaws.com/{}/30.png'.format(orgId),
                    'url': org.get('url')
                }
            except ResourceUnavailable:
                # defunct/deleted organization, assume that board is personal
                orgId = None
        db_board.trello_board_org = orgs[orgId] if orgId else None

        build_list = lambda l: {
            'id': l.id,
            'name': l.name,
            'closed': l.closed,
            'pos': l.pos
        }
        all_lists = {l.id: build_list(l) for l in board.all_lists()}
        db_board.trello_content = {
            'description': _to_html(board.description),
            'lists': sorted(
                filter(lambda x: not x.get('closed'), all_lists.values()),
                key=itemgetter('pos')
            )
        }

        build_member = lambda m: {
            'name': m.full_name,
            'url': m.url,
            'avatar': 'https://trello-avatars.s3.amazonaws.com/{}/30.png'.format(m.avatar_hash)
        }
        all_members = {m.id: build_member(m) for m in board.all_members()}
        db_board.trello_board_members = list(all_members.values())

        db_board.last_synced = get_utc_timestamp()
        db_board.download_status = Document.READY
        db_board.save()
        algolia_engine.sync(db_board, add=created)
        subtask(collect_cards).delay(requester, db_board, board.name, all_members, all_lists)
        # add sleep of 30s to avoid breaking api limits
        time.sleep(30)

def dmap(it, callback):
    # Map a callback over an iterator and return as a group
    # (parameter renamed from `iter` to avoid shadowing the builtin)
    callback = subtask(callback)
    return group(callback.clone([arg, ]) for arg in it)()