def create_and_upload_archive(self, src_url, key):
    """
    A celery task that downloads an archive if it exists from a src location and attempts to upload
    the archive to a supported bucket in each supported region.

    Throughout this process, update the state of the task and finally return the location of the
    s3 urls if successful.

    Expires after 30m if the task hasn't been picked up from the message queue.
    The task is killed if it exceeds the time_limit of an hour after it has started.
    """
    status = ""
    s3_urls = {}
    buckets = current_app.config['ARCHIVER_S3_BUCKETS']
    try:
        s3_urls, status = upload_url_archive_to_s3(key, src_url, buckets)
    except Exception as exc:
        # set a jitter enabled delay
        # where an aggressive delay would result in: 7s, 49s, and 343s
        # and a gentle delay would result in: 4s, 16s, and 64s
        delay = randint(4, 7) ** (current.request.retries + 1)  # retries == 0 on first attempt
        current.retry(exc=exc, countdown=delay)
    return {
        'status': status,
        'src_url': src_url,
        's3_urls': s3_urls,
    }

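# The jittered backoff above raises a random base in [4, 7] to the retry count,
# so successive retries wait 4-7s, 16-49s, then 64-343s, matching the comment in
# the task. A standalone sketch of that schedule (the helper name is ours, not
# from the source):
from random import randint

def jittered_backoff(retries):
    """Delay in seconds for a given retry count: base in [4, 7], exponent retries + 1."""
    return randint(4, 7) ** (retries + 1)

# retries == 0 on the first attempt:
#   retry 0 -> 4-7s, retry 1 -> 16-49s, retry 2 -> 64-343s
for retries in range(3):
    print(jittered_backoff(retries))
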
def update_associated_submissions(database_name, form_model_id, deleted_question_codes):
    try:
        manager = get_db_manager(database_name)
        # update_submissions_for_form_code_change(manager, new_form_code, old_form_code)
        remove_deleted_questions_from_submissions(manager, form_model_id, deleted_question_codes)
    except Exception as e:
        current.retry(exc=e)

def classify(sample_id, from_name='', *args, **kwargs):
    """ Classifies given samples """
    class_sample = ClassifiedSample.objects.get(id=sample_id)
    if class_sample.label:
        return

    job = class_sample.job

    # If classifier is not trained, return - it will be reclassified if
    # the classifier finishes training
    if not job.is_classifier_trained():
        return

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(class_sample)

    if label is None:
        # Something went wrong
        log.warning(
            '[Classification] Got None label for sample %d. Retrying.' % class_sample.id
        )
        current.retry(
            countdown=min(60 * 2 ** (current.request.retries % 6), 60 * 60 * 1),
            max_retries=None,
        )

    ClassifiedSample.objects.filter(id=sample_id).update(label=label)

    send_event(
        'EventSampleClassified',
        job_id=job.id,
        class_id=class_sample.id,
        sample_id=class_sample.sample.id,
    )

def update_associated_submissions(database_name, old_form_code, new_form_code, deleted_question_codes):
    try:
        manager = get_db_manager(database_name)
        update_submissions_for_form_code_change(manager, new_form_code, old_form_code)
        update_submissions_for_form_field_change(manager, old_form_code, deleted_question_codes)
    except Exception as e:
        current.retry(exc=e)

def wrapped(*args, **kwargs):
    try:
        return func(*args, **kwargs)
    except exclude:
        raise
    except on as exc:
        capture_exception()
        current.retry(exc=exc)

def wrapped(*args, **kwargs):
    try:
        return func(*args, **kwargs)
    except exclude:
        raise
    except on as exc:
        Raven.captureException()
        current.retry(exc=exc)

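# The two `wrapped` closures above only make sense inside a decorator factory
# that binds `func`, `on`, and `exclude`. A minimal sketch of such a factory,
# assuming `current` is celery's current-task proxy; the name `retry_on` and its
# defaults are ours, not from the source.
import functools
from celery import current_task as current

def retry_on(on=(Exception,), exclude=()):
    """Hypothetical factory: retry the running task on `on`, re-raise `exclude`."""
    def decorator(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except exclude:
                raise  # excluded exceptions propagate without a retry
            except on as exc:
                current.retry(exc=exc)
        return wrapped
    return decorator
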
def download(url, filesize=None, localFileName=None):
    try:
        global dstdir
        if localFileName is not None:
            localName = localFileName
        else:
            localName = join(dstdir, urlsplit(url).path[1:])
        if splitext(localName)[1].lower() in exclude_exts:
            logger.info("exclude file: %s, skip", localName)
            return True
        req = urllib2.Request(url)
        r = urllib2.urlopen(req)
        file_len = int(r.headers["Content-Length"])
        if filesize is not None and filesize != file_len:
            logger.error("filesize(%s) != file_len(%s): %s", filesize, file_len, url)
        if file_len == 0:
            raise Exception("Get file_len=0 from nginx, retry again ")
        filesize = file_len
        if os.path.exists(localName) and splitext(localName)[1].lower() not in special_exts:
            if getsize(localName) == filesize:
                logger.info("File '%s' existed and filesize(%s) equals, skip", url, filesize)
                r.close()
                return True
            else:
                logger.error("File '%s' existed, but file_len(%s) != local_filesize(%s), redownload",
                             url, filesize, getsize(localName))
        dstdirname = dirname(localName)
        if not os.path.exists(dstdirname):
            os.makedirs(dstdirname)
        block_sz = 8192 * 2
        with open(localName, 'wb') as f:
            while True:
                buffer = r.read(block_sz)
                if not buffer:
                    break
                f.write(buffer)
        r.close()
        sz = getsize(localName)
        if sz != filesize:
            logger.error("download %s unfinished: filesz:%s != localfilesz:%s" % (url, filesize, sz))
            raise Exception("download %s unfinished: filesz:%s != localfilesz:%s" % (url, filesize, sz))
        logger.info("down: %s to %s, filesize=%s, OK", url, localName, sz)
        return True
    except Exception, exc:
        logger.info("down: %s to %s failed: %s", url, localName, exc)
        if isinstance(exc, urllib2.HTTPError) and exc.code == 404:
            return True
        current.retry(exc=exc, countdown=min(2 ** current.request.retries, 360))

def update_single_search_index_item(full_path):
    try:
        return do_update(full_path)
    except Exception, exc:
        print 'Update search index got exception'
        print exc
        traceback.print_exc(exc)
        # exponential retry backoff, in seconds: 1, 2, 4, 8, 16, 32, 64, 128
        current.retry(exc=exc, countdown=min(2 ** current.request.retries, 128))

def cache_thumbnail(id, upload_to_s3=False, marker=True):
    try:
        import shutil
        print settings.MAPBOX_ACCESS_TOKEN
        from vida.firestation.models import FireDepartment
        department = FireDepartment.objects.get(id=id)
        filename = department.thumbnail_name
        generate_thumbnail = department.generate_thumbnail(marker=marker)

        if not marker:
            filename = department.thumbnail_name_no_marker

        full_filename = os.path.join('/home/vida/department-thumbnails', filename)

        if not generate_thumbnail.startswith('/static'):
            f = download_file(generate_thumbnail, full_filename.replace('jpg', 'png'))
            full_filename = convert_png_to_jpg(f)
        else:
            shutil.copy(
                '/webapps/vida/vida/vida/firestation/static/firestation/theme/assets/images/content/property-1.jpg',
                full_filename)

        if upload_to_s3:
            c = boto.s3.connect_to_region(
                'us-east-1',
                aws_access_key_id=getattr(settings, 'AWS_ACCESS_KEY_ID', None),
                aws_secret_access_key=getattr(settings, 'AWS_SECRET_ACCESS_KEY', None),
                is_secure=True,
                calling_format=boto.s3.connection.OrdinaryCallingFormat(),
                debug=2)
            b = c.get_bucket('vida-static/department-thumbnails', validate=False)

            mtype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
            headers = {
                'Content-Type': mtype,
                'Cache-Control': 'max-age=%d, public' % (3600 * 24)
            }

            singlepart_upload(b,
                              key_name=filename,
                              fullpath=full_filename,
                              policy='public-read',
                              reduced_redundancy=False,
                              headers=headers)
    except Exception as exc:
        if current.request.retries < 3:
            current.retry(exc=exc, countdown=min(2 ** current.request.retries, 128))

def async_populate_submission_index(db_name, form_code):
    try:
        try:
            dbm = get_db_manager(db_name)
            from datawinners.search.manage_index import populate_submission_index
            populate_submission_index(dbm, form_code)
        except Exception as e:
            current.retry(exc=e)
    except Exception as e:
        logger = logging.getLogger('tasks')
        logger.exception('Failed for db: %s ,form code: %s' % (db_name, form_code))
        logger.exception(e)

def _populate_submission_index(db_name, form_model_id):
    logger = logging.getLogger('datawinners.tasks')
    try:
        try:
            dbm = get_db_manager(db_name)
            from datawinners.search.manage_index import populate_submission_index
            populate_submission_index(dbm, form_model_id)
            _clear_index_cache(dbm)
        except Exception as e:
            current.retry(exc=e)
    except Exception as e:
        logger.exception('Failed for db: %s ,form model id: %s' % (db_name, form_model_id))
        logger.exception(e)

def _retry(self, exc=None, max_time=(60 * 60 * 3), max_countdown=(60 * 60), kwargs=None):
    request = getattr(current, 'request', None)
    if request is None:
        warning('would retry, but it is not inside running task context')
        return celery.exceptions.Retry('dummy retry after: %s' % exc)

    countdown = min(2 ** current.request.retries, max_countdown)

    # derive max_retries: count how many exponential (capped) delays fit into max_time
    max_retries = 0
    time_counter = 0
    while time_counter < max_time:
        time_counter += min(2 ** max_retries, max_countdown)
        max_retries += 1

    args_new = current.request.args
    if args_new is None:
        args_new = []
    kwargs_new = current.request.kwargs
    if kwargs_new is None:
        kwargs_new = {}
    if kwargs:
        kwargs_new.update(kwargs)

    warning('retrying %s(%s)', current,
            ', '.join(list(map(repr, args_new)) +
                      ['%s=%r' % (k, v) for k, v in kwargs_new.items()]))
    return current.retry(exc=exc, countdown=countdown,
                         max_retries=max_retries, args=args_new, kwargs=kwargs_new)

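# The while loop in `_retry` sizes max_retries so that the capped exponential
# delays sum to at least the wall-clock budget `max_time`. A self-contained
# sketch of that derivation (the function name is ours; defaults mirror `_retry`):
def retries_for_budget(max_time=60 * 60 * 3, max_countdown=60 * 60):
    """Count how many delays of min(2**n, max_countdown) fit into max_time."""
    retries = 0
    elapsed = 0
    while elapsed < max_time:
        elapsed += min(2 ** retries, max_countdown)
        retries += 1
    return retries

# With the 3-hour budget and 1-hour cap this yields 14 retries:
# 1 + 2 + ... + 2048 seconds, then two capped 3600s delays pass the budget.
print(retries_for_budget())
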
def check_instance_status(kwargs):
    nova = Client(kwargs['version'], kwargs['username'], kwargs['password'],
                  kwargs['project'], kwargs['endpoint'])
    instance = nova.servers.get(kwargs['instance_id'])
    if instance.status == "BUILD":
        try:
            raise Exception("Still building")
        except Exception as e:
            interval = min(10 * (2 ** current.request.retries), 1800)
            raise current.retry(args=[kwargs], exc=e, countdown=interval, max_retries=8)
    else:
        kwargs['instance_status'] = instance.status
        record = Record.query.\
            filter_by(instance_id=kwargs['instance_id']).first()
        record.instance_status = kwargs['instance_status']
        try:
            db.session.commit()
            if kwargs['email_addr']:
                send_mail(kwargs['email_addr'],
                          task_id=record.task_id,
                          instance_id=record.instance_id,
                          instance_status=record.instance_status,
                          memo=record.memo)
        except:
            db.session.rollback()
            raise
    return kwargs

def process_resource_change(action, sender, instance_id, *args, **kwargs):
    # The class is serialized as a string when enqueueing the class.
    model = TYPES[sender]

    # Some resources are named differently than their model. eg. Group vs
    # Issue. Looks up the human name for the model. Defaults to the model name.
    name = RESOURCE_RENAMES.get(model.__name__, model.__name__.lower())

    # We may run into a race condition where this task executes before the
    # transaction that creates the Group has committed.
    try:
        instance = model.objects.get(id=instance_id)
    except model.DoesNotExist as e:
        # Explicitly requeue the task, so we don't report this to Sentry until
        # we hit the max number of retries.
        return current.retry(exc=e)

    event = '{}.{}'.format(name, action)

    if event not in VALID_EVENTS:
        return

    org = None

    if isinstance(instance, Group):
        org = instance.organization

    installations = filter(
        lambda i: event in i.sentry_app.events,
        org.sentry_app_installations.select_related('sentry_app'),
    )

    for installation in installations:
        send_webhooks(installation, event, data=serialize(instance))

def retry_task_noargs(**kwargs): current.iterations += 1 retries = kwargs["task_retries"] if retries >= 3: return 42 else: return current.retry(countdown=0)
def retry_task_noargs(**kwargs):
    current.iterations += 1

    retries = kwargs['task_retries']
    if retries >= 3:
        return 42
    else:
        raise current.retry(countdown=0)

def retry_task(arg1, arg2, kwarg=1, max_retries=None, care=True):
    current.iterations += 1
    rmax = current.max_retries if max_retries is None else max_retries

    retries = current.request.retries
    if care and retries >= rmax:
        return arg1
    else:
        return current.retry(countdown=0, max_retries=rmax)

def process_resource_change(action, sender, instance_id, *args, **kwargs):
    model = None
    name = None

    # Previous method signature.
    if inspect.isclass(sender):
        model = sender
    else:
        model = TYPES[sender]

    name = RESOURCE_RENAMES.get(model.__name__, model.__name__.lower())

    # We may run into a race condition where this task executes before the
    # transaction that creates the Group has committed.
    try:
        instance = model.objects.get(id=instance_id)
    except model.DoesNotExist as e:
        # Explicitly requeue the task, so we don't report this to Sentry until
        # we hit the max number of retries.
        return current.retry(exc=e)

    event = '{}.{}'.format(name, action)

    if event not in ALLOWED_EVENTS:
        return

    project = None

    if isinstance(instance, Group):
        project = instance.project

    if not project:
        return

    servicehooks = ServiceHook.objects.filter(
        project_id=project.id,
    )

    for servicehook in filter(lambda s: event in s.events, servicehooks):
        # For now, these ``post_save`` callbacks are only valid for service
        # hooks created by a Sentry App.
        if not servicehook.created_by_sentry_app:
            continue

        request_data = AppPlatformEvent(
            resource=name,
            action=action,
            install=SentryAppInstallation.objects.get(id=servicehook.actor_id),
            data=serialize(instance),
        )

        safe_urlopen(
            url=servicehook.url,
            data=request_data.body,
            headers=request_data.headers,
            timeout=5,
        )

def retry_task_mockapply(arg1, arg2, kwarg=1, **kwargs):
    current.iterations += 1

    retries = kwargs['task_retries']
    if retries >= 3:
        return arg1
    else:
        kwargs.update(kwarg=kwarg)
        raise current.retry(countdown=0)

def retry_task_mockapply(arg1, arg2, kwarg=1, **kwargs): current.iterations += 1 retries = kwargs["task_retries"] if retries >= 3: return arg1 else: kwargs.update(kwarg=kwarg) return current.retry(countdown=0)
def cache_thumbnail(id, upload_to_s3=False, marker=True):
    try:
        import shutil
        print settings.MAPBOX_ACCESS_TOKEN
        from firecares.firestation.models import FireDepartment
        department = FireDepartment.objects.get(id=id)
        filename = department.thumbnail_name
        generate_thumbnail = department.generate_thumbnail(marker=marker)

        if not marker:
            filename = department.thumbnail_name_no_marker

        full_filename = os.path.join('/home/firecares/department-thumbnails', filename)

        if not generate_thumbnail.startswith('/static'):
            f = download_file(generate_thumbnail, full_filename.replace('jpg', 'png'))
            full_filename = convert_png_to_jpg(f)
        else:
            shutil.copy('/webapps/firecares/firecares/firecares/firestation/static/firestation/theme/assets/images/content/property-1.jpg',
                        full_filename)

        if upload_to_s3:
            c = boto.s3.connect_to_region('us-east-1',
                                          aws_access_key_id=getattr(settings, 'AWS_ACCESS_KEY_ID', None),
                                          aws_secret_access_key=getattr(settings, 'AWS_SECRET_ACCESS_KEY', None),
                                          is_secure=True,
                                          calling_format=boto.s3.connection.OrdinaryCallingFormat(),
                                          debug=2)
            b = c.get_bucket('firecares-static/department-thumbnails', validate=False)
            mtype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
            headers = {'Content-Type': mtype, 'Cache-Control': 'max-age=%d, public' % (3600 * 24)}
            singlepart_upload(b,
                              key_name=filename,
                              fullpath=full_filename,
                              policy='public-read',
                              reduced_redundancy=False,
                              headers=headers)
    except Exception as exc:
        if current.request.retries < 3:
            current.retry(exc=exc, countdown=min(2 ** current.request.retries, 128))

def retry_task_customexc(arg1, arg2, kwarg=1, **kwargs): current.iterations += 1 retries = kwargs["task_retries"] if retries >= 3: return arg1 + kwarg else: try: raise MyCustomException("Elaine Marie Benes") except MyCustomException, exc: kwargs.update(kwarg=kwarg) return current.retry(countdown=0, exc=exc)
def retry_task_customexc(arg1, arg2, kwarg=1, **kwargs):
    current.iterations += 1

    retries = kwargs['task_retries']
    if retries >= 3:
        return arg1 + kwarg
    else:
        try:
            raise MyCustomException('Elaine Marie Benes')
        except MyCustomException as exc:
            kwargs.update(kwarg=kwarg)
            raise current.retry(countdown=0, exc=exc)

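# The retry_task_* fixtures above all follow one pattern: retry with countdown=0
# until the broker-reported retry count reaches a threshold, then return a value.
# A plain-Python simulation of that loop, with no Celery dependency (all names
# here are ours, for illustration only):
class FakeRequest(object):
    def __init__(self):
        self.retries = 0

def run_with_retries(task, max_retries=5):
    """Simulate a broker redelivering a task each time it asks to retry."""
    request = FakeRequest()
    for _ in range(max_retries + 1):
        result, should_retry = task(request)
        if not should_retry:
            return result
        request.retries += 1
    raise RuntimeError('max retries exceeded')

def retry_until_three(request):
    # mirrors the fixtures: succeed once the retry count reaches 3
    if request.retries >= 3:
        return 42, False
    return None, True

print(run_with_retries(retry_until_three))  # prints 42 on the fourth attempt
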
def index_for_search(ebook_id=None, ebook_data=None):
    """
    Add ebook to the Whoosh search index
    """
    with app.app_context():
        if ebook_id and not ebook_data:
            ds = DataStore(app.config, app.logger)
            ebook_data = ds.load_ebook(ebook_id)
        elif not ebook_data:
            raise Exception('index_for_search task called without ebook_id or ebook_data params')

        try:
            # create search class and index
            search = Search(init_whoosh(app), pagelen=app.config.get('SEARCH_PAGELEN', 20))
            search.index_for_search(ebook_data)
        except whoosh.writing.LockError:
            # if index is unavailable, retry again shortly (countdown=1)
            current.retry(
                kwargs={'ebook_id': ebook_id, 'ebook_data': ebook_data},
                countdown=1,
            )

def classify_btm(sample_id, from_name='', *args, **kwargs): """ Classifies given samples """ log.info( '[BTMClassification] Got sample %d for classification.' % sample_id ) btm_sample = BeatTheMachineSample.objects.get(id=sample_id) if btm_sample.label: return job = btm_sample.job # If classifier is not trained, retry later if not job.is_classifier_trained(): current.retry(countdown=min(60 * 2 ** current.request.retries, 60 * 60 * 24)) classifier = classifier_factory.create_classifier(job.id) label = classifier.classify(btm_sample) if label is None: # Something went wrong log.warning( '[BTMClassification] Got None label for sample %d. Retrying.' % btm_sample.id ) current.retry(countdown=min(60 * 2 ** current.request.retries, 60 * 60 * 24)) BeatTheMachineSample.objects.filter(id=sample_id).update(label=label) btm_sample.updateBTMStatus() send_event( 'EventSampleBTM', job_id=job.id, btm_id=btm_sample.id, sample_id=btm_sample.sample.id, )
def upload(bucket_name='androidpackage', filename=None, body=None):
    try:
        meta = {
            "version_name": None,
            "version_code": None,
            "min_sdk_version": None,
            "package": None,
            "md5": md5(body).hexdigest()
        }
        try:
            pkg = apk.APK(body, raw=True)
            meta["version_name"] = pkg.get_androidversion_name()
            meta["min_sdk_version"] = int(pkg.get_min_sdk_version())
            meta["version_code"] = int(pkg.get_androidversion_code())
            meta["package"] = pkg.get_package()
            meta["valid"] = 1
        except:
            meta["valid"] = 0

        conn = S3Connection()
        bucket = conn.get_bucket(bucket_name)
        if filename:
            app_key = filename
        else:
            if meta['valid']:
                app_key = "upload/%(package)s__%(version_code)d.apk" % meta
            else:
                app_key = "upload/novalid/%s.apk" % meta['md5']
        key = bucket.new_key(app_key)
        for k, v in meta.iteritems():
            if v:
                key.set_metadata(k, v)
        key.set_contents_from_string(body)
        key.close()
    except socket.error, e:
        current.retry(exc=e)

def process_resource_change(sender, instance_id, *args, **kwargs):
    model = None
    name = None

    # Previous method signature.
    if inspect.isclass(sender):
        model = sender
    else:
        model = TYPES[sender]

    name = RESOURCE_RENAMES.get(model.__name__, model.__name__.lower())

    # We may run into a race condition where this task executes before the
    # transaction that creates the Group has committed.
    try:
        instance = model.objects.get(id=instance_id)
    except model.DoesNotExist as e:
        # Explicitly requeue the task, so we don't report this to Sentry until
        # we hit the max number of retries.
        return current.retry(exc=e)

    action = u'{}.created'.format(name)

    if action not in ALLOWED_ACTIONS:
        return

    project = None

    if isinstance(instance, Group):
        project = instance.project

    if not project:
        return

    servicehooks = ServiceHook.objects.filter(project_id=project.id)

    for servicehook in filter(lambda s: action in s.events, servicehooks):
        # For now, these ``post_save`` callbacks are only valid for service
        # hooks created by a Sentry App.
        if not servicehook.created_by_sentry_app:
            continue

        payload = app_platform_event(
            action,
            SentryAppInstallation.objects.get(id=servicehook.actor_id),
            serialize(instance),
        )

        send_request(servicehook, payload, verify_ssl=True)

def create_and_upload_archive(self, src_url, key):
    """
    A celery task that downloads an archive if it exists from a src location and attempts to upload
    the archive to a supported bucket in each supported region.

    Throughout this process, update the state of the task and finally return the location of the
    s3 urls if successful.

    Expires after 30m if the task hasn't been picked up from the message queue.
    The task is killed if it exceeds the time_limit of an hour after it has started.
    """
    status = "Task completed! Check 's3_urls' for upload locations."
    s3_urls = {}
    buckets = current_app.config['ARCHIVER_S3_BUCKETS']

    log.info('Key to be uploaded to S3: %s - Verifying src_url: %s', key, src_url)
    resp = requests.head(src_url)
    if resp.status_code == 200:
        try:
            s3_urls = upload_url_archive_to_s3(key, src_url, buckets)
        except Exception as exc:
            # set a jitter enabled delay
            # where an aggressive delay would result in: 7s, 49s, and 343s
            # and a gentle delay would result in: 4s, 16s, and 64s
            delay = randint(4, 7) ** (current.request.retries + 1)  # retries == 0 on first attempt
            current.retry(exc=exc, countdown=delay)
    else:
        status = "Url not found. Does it exist? url: '{}', response: '{}'".format(src_url, resp.status_code)
        log.warning(status)

    return {
        'status': status,
        'src_url': src_url,
        's3_urls': s3_urls,
    }

def classify_btm(sample_id, from_name='', *args, **kwargs): """ Classifies given samples """ log.info('[BTMClassification] Got sample %d for classification.' % sample_id) btm_sample = BeatTheMachineSample.objects.get(id=sample_id) if btm_sample.label: return job = btm_sample.job # If classifier is not trained, retry later if not job.is_classifier_trained(): current.retry(countdown=min(60 * 2**current.request.retries, 60 * 60 * 24)) classifier = classifier_factory.create_classifier(job.id) label = classifier.classify(btm_sample) if label is None: # Something went wrong log.warning( '[BTMClassification] Got None label for sample %d. Retrying.' % btm_sample.id) current.retry(countdown=min(60 * 2**current.request.retries, 60 * 60 * 24)) BeatTheMachineSample.objects.filter(id=sample_id).update(label=label) btm_sample.updateBTMStatus() send_event( 'EventSampleBTM', job_id=job.id, btm_id=btm_sample.id, sample_id=btm_sample.sample.id, )
def sync_with_changelog():
    """Synchronize with pypi changelog.

    Right now we only listen for `new-release`, `remove`, `rename`, and
    `create` as we do not store any metadata information.

    Following actions can be issued according to pypi source code:

    new release - Creates a new Release
    remove - Removes a Package from the Shop
    rename from %(old)s - Rename a package
    add %(pyversion)s %(filename)s - Add a new file to a version
    remove file %(filename)s - Remove a file
    docupdate - Notify for documentation update
    create - Create a new package
    update %(type)s - Update some detailed classifiers
    """
    next_last_sync = timezone.now()
    state, created = SyncState.objects.get_or_create(type=SyncState.CHANGELOG)

    epoch = int(time.mktime(state.last_sync.timetuple()))

    client = CheeseShop()

    try:
        log = client.get_changelog(epoch, True)
    except socket.error as exc:
        if current.iterations == current.max_retries:
            SyncState.objects.filter(type=SyncState.CHANGELOG) \
                .update(state=SyncState.STATE_DOWN)
            logger.warning('No sync with PyPi, it\'s not reachable.')
            return
        else:
            current.iterations += 1
            current.retry(countdown=0, exc=exc)
    else:
        projects = set()
        for package, version, stamp, action in log:
            if action == 'new release':
                try:
                    pkg = Package.objects.get(name=package)
                except Package.DoesNotExist:
                    pkg = Package.create_with_provider_url(package)

                dt = datetime.datetime.fromtimestamp(stamp)
                release_date = timezone.make_aware(dt, pytz.UTC)
                exists = PackageVersion.objects.filter(package=pkg, version=version).exists()
                if not exists:
                    update = PackageVersion(version=version, release_date=release_date)
                    pkg.versions.add(update)
                    ProjectDependency.objects.filter(package=pkg) \
                        .update(update=update)
                    projects.update(Project.objects.filter(dependencies__package=pkg)
                                    .values_list('id', flat=True))
            elif action == 'remove':
                # We only clear versions and set the recent updated version
                # on every project dependency to NULL. This way we can ensure
                # stability on ProjectDependency.
                try:
                    pkg = Package.objects.get(name=package)
                    ProjectDependency.objects.filter(package=pkg) \
                        .update(update=None)
                    if version is None:
                        pkg.versions.all().delete()
                        log_affected_projects(pkg, action='remove_package', type='package', package=pkg)
                except Package.DoesNotExist:
                    pass
            elif action == 'create':
                if not Package.objects.filter(name=package).exists():
                    Package.create_with_provider_url(package)

        for project in projects:
            sync_project.apply(args=(project,))

        SyncState.objects.filter(type=SyncState.CHANGELOG) \
            .update(last_sync=next_last_sync, state=SyncState.STATE_RUNNING)

def assemble_download(
    data_export_id,
    export_limit=EXPORTED_ROWS_LIMIT,
    batch_size=SNUBA_MAX_RESULTS,
    offset=0,
    bytes_written=0,
    environment_id=None,
    **kwargs
):
    with sentry_sdk.start_transaction(
        op="task.data_export.assemble",
        name="DataExportAssemble",
        sampled=True,
    ):
        first_page = offset == 0

        try:
            if first_page:
                logger.info("dataexport.start", extra={"data_export_id": data_export_id})
            data_export = ExportedData.objects.get(id=data_export_id)
            if first_page:
                metrics.incr("dataexport.start", tags={"success": True}, sample_rate=1.0)
            logger.info(
                "dataexport.run", extra={"data_export_id": data_export_id, "offset": offset}
            )
        except ExportedData.DoesNotExist as error:
            if first_page:
                metrics.incr("dataexport.start", tags={"success": False}, sample_rate=1.0)
            logger.exception(error)
            return

        with sentry_sdk.configure_scope() as scope:
            if data_export.user:
                user = {}
                if data_export.user.id:
                    user["id"] = data_export.user.id
                if data_export.user.username:
                    user["username"] = data_export.user.username
                if data_export.user.email:
                    user["email"] = data_export.user.email
                scope.user = user
            scope.set_tag("organization.slug", data_export.organization.slug)
            scope.set_tag("export.type", ExportQueryType.as_str(data_export.query_type))
            scope.set_extra("export.query", data_export.query_info)

        try:
            # ensure that the export limit is set and capped at EXPORTED_ROWS_LIMIT
            if export_limit is None:
                export_limit = EXPORTED_ROWS_LIMIT
            else:
                export_limit = min(export_limit, EXPORTED_ROWS_LIMIT)

            processor = get_processor(data_export, environment_id)

            with tempfile.TemporaryFile(mode="w+b") as tf:
                # XXX(python3):
                #
                # In python2 land we write utf-8 encoded strings as bytes via
                # the csv writer (see convert_to_utf8). The CSV writer will
                # ONLY write bytes, even if you give it unicode it will convert
                # it to bytes.
                #
                # In python3 we write unicode strings (which is all the csv
                # module is able to do, it will NOT write bytes like in py2).
                # Because of this we use the codec getwriter to transform our
                # file handle to a stream writer that will encode to utf8.
                if six.PY2:
                    tfw = tf
                else:
                    tfw = codecs.getwriter("utf-8")(tf)

                writer = csv.DictWriter(tfw, processor.header_fields, extrasaction="ignore")
                if first_page:
                    writer.writeheader()

                # the position in the file at the end of the headers
                starting_pos = tf.tell()

                # the row offset relative to the start of the current task
                # this offset tells you the number of rows written during this batch fragment
                fragment_offset = 0

                # the absolute row offset from the beginning of the export
                next_offset = offset + fragment_offset

                while True:
                    # the number of rows to export in the next batch fragment
                    fragment_row_count = min(batch_size, max(export_limit - next_offset, 1))

                    rows = process_rows(processor, data_export, fragment_row_count, next_offset)
                    writer.writerows(rows)

                    fragment_offset += len(rows)
                    next_offset = offset + fragment_offset

                    if (
                        not rows
                        or len(rows) < batch_size
                        # the batch may exceed MAX_BATCH_SIZE but immediately stops
                        or tf.tell() - starting_pos >= MAX_BATCH_SIZE
                    ):
                        break

                tf.seek(0)
                new_bytes_written = store_export_chunk_as_blob(data_export, bytes_written, tf)
                bytes_written += new_bytes_written
        except ExportError as error:
            return data_export.email_failure(message=six.text_type(error))
        except Exception as error:
            metrics.incr("dataexport.error", tags={"error": six.text_type(error)}, sample_rate=1.0)
            logger.error(
                "dataexport.error: %s",
                six.text_type(error),
                extra={"query": data_export.payload, "org": data_export.organization_id},
            )
            capture_exception(error)

            try:
                current.retry()
            except MaxRetriesExceededError:
                metrics.incr(
                    "dataexport.end",
                    tags={"success": False, "error": six.text_type(error)},
                    sample_rate=1.0,
                )
                return data_export.email_failure(message="Internal processing failure")
        else:
            if (
                rows
                and len(rows) >= batch_size
                and new_bytes_written
                and next_offset < export_limit
            ):
                assemble_download.delay(
                    data_export_id,
                    export_limit=export_limit,
                    batch_size=batch_size,
                    offset=next_offset,
                    bytes_written=bytes_written,
                    environment_id=environment_id,
                )
            else:
                metrics.timing("dataexport.row_count", next_offset, sample_rate=1.0)
                metrics.timing("dataexport.file_size", bytes_written, sample_rate=1.0)
                merge_export_blobs.delay(data_export_id)

def wrapped(*args, **kwargs):
    try:
        return func(*args, **kwargs)
    except Exception as exc:
        current.retry(exc=exc)

def assemble_download(data_export_id, export_limit=EXPORTED_ROWS_LIMIT, batch_size=SNUBA_MAX_RESULTS, offset=0, bytes_written=0, environment_id=None, **kwargs): first_page = offset == 0 try: if first_page: logger.info("dataexport.start", extra={"data_export_id": data_export_id}) data_export = ExportedData.objects.get(id=data_export_id) if first_page: metrics.incr("dataexport.start", tags={"success": True}, sample_rate=1.0) logger.info("dataexport.run", extra={ "data_export_id": data_export_id, "offset": offset }) except ExportedData.DoesNotExist as error: if first_page: metrics.incr("dataexport.start", tags={"success": False}, sample_rate=1.0) logger.exception(error) return try: if export_limit is None: export_limit = EXPORTED_ROWS_LIMIT else: export_limit = min(export_limit, EXPORTED_ROWS_LIMIT) # if there is an export limit, the last batch should only return up to the export limit if export_limit is not None: batch_size = min(batch_size, max(export_limit - offset, 0)) processor = get_processor(data_export, environment_id) with tempfile.TemporaryFile() as tf: writer = csv.DictWriter(tf, processor.header_fields, extrasaction="ignore") if first_page: writer.writeheader() rows = process_rows(processor, data_export, batch_size, offset) writer.writerows(rows) next_offset = offset + len(rows) tf.seek(0) new_bytes_written = store_export_chunk_as_blob( data_export, bytes_written, tf) bytes_written += new_bytes_written except ExportError as error: return data_export.email_failure(message=six.text_type(error)) except Exception as error: metrics.incr("dataexport.error", tags={"error": six.text_type(error)}, sample_rate=1.0) logger.error( "dataexport.error: %s", six.text_type(error), extra={ "query": data_export.payload, "org": data_export.organization_id }, ) capture_exception(error) try: current.retry() except MaxRetriesExceededError: return data_export.email_failure( message="Internal processing failure") else: if (rows and len(rows) >= batch_size and new_bytes_written and (export_limit is None or next_offset < export_limit)): assemble_download.delay( data_export_id, export_limit=export_limit, batch_size=batch_size, offset=next_offset, bytes_written=bytes_written, environment_id=environment_id, ) else: merge_export_blobs.delay(data_export_id)
            sample_url=sample.url,
            job_id=sample.job_id,
        )
    except subprocess.CalledProcessError, e:
        # Something wrong has happened to links. Couldn't find documentation on
        # error codes - assume bad stuff has happened that retrying won't fix.
        send_event(
            'EventSampleContentFail',
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
            error_code=e.returncode
        )
        return False
    except DatabaseError, e:
        current.retry(exc=e, countdown=min(60 * 2 ** current.request.retries, 60 * 60 * 24))

    return text != ''


@task()
def web_screenshot_extraction(sample_id, url=None, *args, **kwargs):
    """ Generates html output from those browsers. """
    if url is None:
        url = Sample.objects.get(id=sample_id).url

    if not is_proper_url(url):
        return False

    sample = Sample.objects.get(id=sample_id)

def assemble_download(data_export_id, export_limit=EXPORTED_ROWS_LIMIT, batch_size=SNUBA_MAX_RESULTS, offset=0, bytes_written=0, environment_id=None, **kwargs): first_page = offset == 0 try: if first_page: logger.info("dataexport.start", extra={"data_export_id": data_export_id}) data_export = ExportedData.objects.get(id=data_export_id) if first_page: metrics.incr("dataexport.start", tags={"success": True}, sample_rate=1.0) logger.info("dataexport.run", extra={ "data_export_id": data_export_id, "offset": offset }) except ExportedData.DoesNotExist as error: if first_page: metrics.incr("dataexport.start", tags={"success": False}, sample_rate=1.0) logger.exception(error) return try: # ensure that the export limit is set and capped at EXPORTED_ROWS_LIMIT if export_limit is None: export_limit = EXPORTED_ROWS_LIMIT else: export_limit = min(export_limit, EXPORTED_ROWS_LIMIT) processor = get_processor(data_export, environment_id) with tempfile.TemporaryFile() as tf: writer = csv.DictWriter(tf, processor.header_fields, extrasaction="ignore") if first_page: writer.writeheader() # the position in the file at the end of the headers starting_pos = tf.tell() # the row offset relative to the start of the current task # this offset tells you the number of rows written during this batch fragment fragment_offset = 0 # the absolute row offset from the beginning of the export next_offset = offset + fragment_offset while True: # the number of rows to export in the next batch fragment fragment_row_count = min(batch_size, max(export_limit - next_offset, 1)) rows = process_rows(processor, data_export, fragment_row_count, next_offset) writer.writerows(rows) fragment_offset += len(rows) next_offset = offset + fragment_offset if (not rows or len(rows) < batch_size # the batch may exceed MAX_BATCH_SIZE but immediately stops or tf.tell() - starting_pos >= MAX_BATCH_SIZE): break tf.seek(0) new_bytes_written = store_export_chunk_as_blob( data_export, bytes_written, tf) bytes_written += new_bytes_written except ExportError as error: return data_export.email_failure(message=six.text_type(error)) except Exception as error: metrics.incr("dataexport.error", tags={"error": six.text_type(error)}, sample_rate=1.0) logger.error( "dataexport.error: %s", six.text_type(error), extra={ "query": data_export.payload, "org": data_export.organization_id }, ) capture_exception(error) try: current.retry() except MaxRetriesExceededError: metrics.incr( "dataexport.end", tags={ "success": False, "error": six.text_type(error) }, sample_rate=1.0, ) return data_export.email_failure( message="Internal processing failure") else: if rows and len( rows ) >= batch_size and new_bytes_written and next_offset < export_limit: assemble_download.delay( data_export_id, export_limit=export_limit, batch_size=batch_size, offset=next_offset, bytes_written=bytes_written, environment_id=environment_id, ) else: metrics.timing("dataexport.row_count", next_offset) metrics.timing("dataexport.file_size", bytes_written) merge_export_blobs.delay(data_export_id)