Example #1
def create_and_upload_archive(self, src_url, key):
    """
    A celery task that downloads an archive if it exists from a src location and attempts to upload
    the archive to a supported bucket in each supported region.

    Throughout this process, update the state of the task and finally return the location of the
    s3 urls if successful.

    expires after 30m if the task hasn't been picked up from the message queue

    task is killed if it exceeds the time_limit of an hour after it has started
    """
    status = ""
    s3_urls = {}
    buckets = current_app.config['ARCHIVER_S3_BUCKETS']

    try:
        s3_urls, status = upload_url_archive_to_s3(key, src_url, buckets)
    except Exception as exc:
        # set a jitter enabled delay
        # where an aggressive delay would result in: 7s, 49s, and 343s
        # and a gentle delay would result in: 4s, 16s, and 64s
        delay = randint(4, 7) ** (current.request.retries + 1)  # retries == 0 on first attempt
        current.retry(exc=exc, countdown=delay)

    return {
        'status': status,
        'src_url': src_url,
        's3_urls': s3_urls,
    }
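
The jittered delay above is just a random base between 4 and 7 raised to the attempt number. A minimal standalone sketch of the resulting schedules, assuming nothing beyond the formula in the snippet:

from random import randint

def jittered_delay(retries):
    # retries == 0 on the first attempt, so the exponent starts at 1
    return randint(4, 7) ** (retries + 1)

if __name__ == '__main__':
    # gentle case (base 4): 4s, 16s, 64s; aggressive case (base 7): 7s, 49s, 343s
    for base in (4, 7):
        print([base ** (retries + 1) for retries in range(3)])
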
Example #2
def update_associated_submissions(database_name, form_model_id, deleted_question_codes):
    try:
        manager = get_db_manager(database_name)
        #update_submissions_for_form_code_change(manager, new_form_code, old_form_code)
        remove_deleted_questions_from_submissions(manager, form_model_id, deleted_question_codes)
    except Exception as e:
        current.retry(exc=e)
Example #3
def create_and_upload_archive(self, src_url, key):
    """
    A celery task that downloads an archive if it exists from a src location and attempts to upload
    the archive to a supported bucket in each supported region.

    Throughout this process, update the state of the task and finally return the location of the
    s3 urls if successful.

    expires after 30m if the task hasn't been picked up from the message queue

    task is killed if it exceeds the time_limit of an hour after it has started
    """
    status = ""
    s3_urls = {}
    buckets = current_app.config['ARCHIVER_S3_BUCKETS']

    try:
        s3_urls, status = upload_url_archive_to_s3(key, src_url, buckets)
    except Exception as exc:
        # set a jitter enabled delay
        # where an aggressive delay would result in: 7s, 49s, and 343s
        # and a gentle delay would result in: 4s, 16s, and 64s
        delay = randint(4, 7) ** (current.request.retries + 1)  # retries == 0 on first attempt
        current.retry(exc=exc, countdown=delay)

    return {
        'status': status,
        'src_url': src_url,
        's3_urls': s3_urls,
    }
Example #4
def classify(sample_id, from_name='', *args, **kwargs):
    """
        Classifies given samples
    """
    class_sample = ClassifiedSample.objects.get(id=sample_id)
    if class_sample.label:
        return

    job = class_sample.job

    # If classifier is not trained, return - it will be reclassified if
    # the classifier finishes training
    if not job.is_classifier_trained():
        return

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(class_sample)

    if label is None:
        # Something went wrong
        log.warning(
            '[Classification] Got None label for sample %d. Retrying.' % class_sample.id
        )
        current.retry(
            countdown=min(60 * 2 ** (current.request.retries % 6), 60 * 60 * 1),
            max_retries=None,
        )
    ClassifiedSample.objects.filter(id=sample_id).update(label=label)

    send_event(
        'EventSampleClassified',
        job_id=job.id,
        class_id=class_sample.id,
        sample_id=class_sample.sample.id,
    )
Example #5
def update_associated_submissions(database_name, old_form_code, new_form_code, deleted_question_codes):
    try:
        manager = get_db_manager(database_name)
        update_submissions_for_form_code_change(manager, new_form_code, old_form_code)
        update_submissions_for_form_field_change(manager, old_form_code, deleted_question_codes)
    except Exception as e:
        current.retry(exc=e)
Example #6
def classify(sample_id, from_name='', *args, **kwargs):
    """
        Classifies given samples
    """
    class_sample = ClassifiedSample.objects.get(id=sample_id)
    if class_sample.label:
        return

    job = class_sample.job

    # If classifier is not trained, return - it will be reclassified if
    # the classifier finishes training
    if not job.is_classifier_trained():
        return

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(class_sample)

    if label is None:
        # Something went wrong
        log.warning(
            '[Classification] Got None label for sample %d. Retrying.' %
            class_sample.id)
        current.retry(
            countdown=min(60 * 2**(current.request.retries % 6), 60 * 60 * 1),
            max_retries=None,
        )
    ClassifiedSample.objects.filter(id=sample_id).update(label=label)

    send_event(
        'EventSampleClassified',
        job_id=job.id,
        class_id=class_sample.id,
        sample_id=class_sample.sample.id,
    )
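
The countdown in Examples #4 and #6 cycles through six doubling delays because of the % 6, so a task retried with max_retries=None keeps waiting 1, 2, 4, 8, 16 and 32 minutes instead of backing off without bound. A small sketch of that schedule, assuming only the formula from the snippets:

def classify_retry_countdown(retries):
    # 60s, 120s, 240s, 480s, 960s, 1920s, then the cycle repeats;
    # the 60-minute cap never binds because 2**5 minutes is only 32 minutes
    return min(60 * 2 ** (retries % 6), 60 * 60 * 1)

if __name__ == '__main__':
    print([classify_retry_countdown(r) for r in range(8)])
    # [60, 120, 240, 480, 960, 1920, 60, 120]
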
Example #7
 def wrapped(*args, **kwargs):
     try:
         return func(*args, **kwargs)
     except exclude:
         raise
     except on as exc:
         capture_exception()
         current.retry(exc=exc)
Example #8
 def wrapped(*args, **kwargs):
     try:
         return func(*args, **kwargs)
     except exclude:
         raise
     except on as exc:
         Raven.captureException()
         current.retry(exc=exc)
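
Examples #7 and #8 are the inner closure of a retry decorator whose `on` and `exclude` exception tuples come from an enclosing factory. A minimal sketch of such a factory, assuming `current` is Celery's current_task proxy (the capture_exception / Raven.captureException reporting hook is omitted):

import functools

from celery import current_task as current

def retry_on(on=(Exception,), exclude=()):
    # Returns a decorator that retries the currently running task when an
    # exception in `on` is raised, but lets anything in `exclude` propagate.
    def decorator(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except exclude:
                raise
            except on as exc:
                current.retry(exc=exc)
        return wrapped
    return decorator
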
Example #9
def download(url, filesize = None, localFileName = None):
    try:
        global dstdir

        if localFileName != None:
            localName = localFileName
        else:
            localName = join(dstdir, urlsplit(url).path[1:])

        if splitext(localName)[1].lower() in exclude_exts:
            logger.info("exclude file: %s, skip", localName) 
	    return True 

        req = urllib2.Request(url)
        r = urllib2.urlopen(req)
        file_len = int(r.headers["Content-Length"])

        if filesize is not None and filesize != file_len:
            logger.error("filesize(%s) != file_len(%s): %s", filesize, file_len, url)
            if file_len == 0:
                raise Exception("Get file_len=0 from nginx, retry again ")

        filesize = file_len

        if os.path.exists(localName) and splitext(localName)[1].lower() not in special_exts:
            if getsize(localName) == filesize:
                logger.info("File \'%s\' existed and filesize(%s) equals, skip", url, filesize)
                r.close()
                return True
            else:
                logger.error("File \'%s\' existed, but file_len(%s) != local_filesize(%s), redownload", url, filesize, getsize(localName))

        dstdirname = dirname(localName)
        if not os.path.exists(dstdirname):
            os.makedirs(dstdirname)
            
        block_sz = 8192*2
        with open(localName, 'wb') as f:
            while True:
                buffer = r.read(block_sz)
                if not buffer:
                    break
                f.write(buffer)

        r.close()
        sz = getsize(localName)
        if sz != filesize:
            logger.error("download %s unfinished: filesz:%s != localfilesz:%s" % (url, filesize, sz))
            raise Exception("download %s unfinished: filesz:%s != localfilesz:%s" % (url, filesize, sz))
        logger.info("down: %s to %s, filesize=%s, OK", url, localName, sz)

        return True 
    except Exception, exc:
        logger.info("down: %s to %s failed: %s", url, localName, exc)
        if isinstance(exc, urllib2.HTTPError) and exc.code == 404:
            return True
        current.retry(exc=exc, countdown=min(2 ** current.request.retries, 360))
Example #10
def update_single_search_index_item(full_path):
    try:
        return do_update(full_path)
    except Exception, exc:
        print 'Update search index got exception'
        print exc
        traceback.print_exc(exc)
        # exponential retry backoff, in seconds: 1, 2, 4, 8, 16, 32, 64, 128
        current.retry(exc=exc, countdown=min(2 ** current.request.retries, 128))
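
Examples #9 and #10 share the same capped doubling backoff, differing only in the cap (360s vs 128s). A tiny sketch of the schedule, assuming only the formula from the snippets:

def capped_backoff(retries, cap):
    # doubles on every retry until it reaches the cap
    return min(2 ** retries, cap)

if __name__ == '__main__':
    print([capped_backoff(r, 128) for r in range(9)])   # 1, 2, 4, ..., 128, 128
    print([capped_backoff(r, 360) for r in range(10)])  # 1, 2, 4, ..., 256, 360
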
Example #11
def cache_thumbnail(id, upload_to_s3=False, marker=True):
    try:
        import shutil
        print settings.MAPBOX_ACCESS_TOKEN
        from vida.firestation.models import FireDepartment
        department = FireDepartment.objects.get(id=id)

        filename = department.thumbnail_name
        generate_thumbnail = department.generate_thumbnail(marker=marker)

        if not marker:
            filename = department.thumbnail_name_no_marker

        full_filename = os.path.join('/home/vida/department-thumbnails',
                                     filename)

        if not generate_thumbnail.startswith('/static'):
            f = download_file(generate_thumbnail,
                              full_filename.replace('jpg', 'png'))
            full_filename = convert_png_to_jpg(f)
        else:
            shutil.copy(
                '/webapps/vida/vida/vida/firestation/static/firestation/theme/assets/images/content/property-1.jpg',
                full_filename)

        if upload_to_s3:
            c = boto.s3.connect_to_region(
                'us-east-1',
                aws_access_key_id=getattr(settings, 'AWS_ACCESS_KEY_ID', None),
                aws_secret_access_key=getattr(settings,
                                              'AWS_SECRET_ACCESS_KEY', None),
                is_secure=True,
                calling_format=boto.s3.connection.OrdinaryCallingFormat(),
                debug=2)

            b = c.get_bucket('vida-static/department-thumbnails',
                             validate=False)
            mtype = mimetypes.guess_type(
                filename)[0] or 'application/octet-stream'
            headers = {
                'Content-Type': mtype,
                'Cache-Control': 'max-age=%d, public' % (3600 * 24)
            }
            singlepart_upload(b,
                              key_name=filename,
                              fullpath=full_filename,
                              policy='public-read',
                              reduced_redundancy=False,
                              headers=headers)

    except Exception as exc:
        if current.request.retries < 3:
            current.retry(exc=exc,
                          countdown=min(2**current.request.retries, 128))
Example #12
def async_populate_submission_index(db_name, form_code):
    try:
        try:
            dbm = get_db_manager(db_name)
            from datawinners.search.manage_index import populate_submission_index

            populate_submission_index(dbm, form_code)
        except Exception as e:
            current.retry(exc=e)
    except Exception as e:
        logger = logging.getLogger('tasks')
        logger.exception('Failed for db: %s ,form code: %s' %
                         (db_name, form_code))
        logger.exception(e)
Example #13
def _populate_submission_index(db_name, form_model_id):
    logger = logging.getLogger('datawinners.tasks')
    try:
        try:
            dbm = get_db_manager(db_name)
            from datawinners.search.manage_index import populate_submission_index

            populate_submission_index(dbm, form_model_id)
            _clear_index_cache(dbm)
        except Exception as e:
            current.retry(exc=e)
    except Exception as e:
        logger.exception('Failed for db: %s ,form model id: %s' % (db_name, form_model_id))
        logger.exception(e)
Example #14
    def _retry(self, exc=None, max_time=(60 * 60 * 3), max_countdown=(60 * 60), kwargs=None):

        request = getattr(current, 'request', None)
        if request is None:
            warning('would retry, but it is not inside running task context')
            return celery.exceptions.Retry('dummy retry after: %s' % exc)

        countdown = min(2 ** current.request.retries, max_countdown)
        max_retries = 0
        time_counter = 0
        while time_counter < max_time:
            time_counter += min(2 ** max_retries, max_countdown)
            max_retries += 1

        args_new = current.request.args

        if args_new is None:
            args_new = []

        kwargs_new = current.request.kwargs
        if kwargs_new is None:
            kwargs_new = {}
        if kwargs:
            kwargs_new.update(kwargs)

        warning('retrying %s(%s)', current, ', '.join(list(map(repr, args_new)) + ['%s=%r' % (k, v) for k, v in kwargs_new.items()]))

        return current.retry(exc=exc, countdown=countdown, max_retries=max_retries, args=args_new, kwargs=kwargs_new)
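
The while loop in Example #14 derives max_retries from max_time by summing the capped countdowns that would be slept through. A standalone sketch of that calculation, assuming only the arithmetic from the method:

def retries_within(max_time=60 * 60 * 3, max_countdown=60 * 60):
    # Count how many capped-exponential retries fit inside max_time.
    max_retries = 0
    time_counter = 0
    while time_counter < max_time:
        time_counter += min(2 ** max_retries, max_countdown)
        max_retries += 1
    return max_retries

if __name__ == '__main__':
    # with the defaults: twelve doubling steps (1s..2048s), then two capped
    # 3600s steps push past 3 hours, so this prints 14
    print(retries_within())
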
Example #15
def check_instance_status(kwargs):
    nova = Client(kwargs['version'], kwargs['username'], kwargs['password'],
                  kwargs['project'], kwargs['endpoint'])
    instance = nova.servers.get(kwargs['instance_id'])

    if instance.status == "BUILD":
        try:
            raise Exception("Still building")
        except Exception as e:
            interval = min(10 * (2 ** current.request.retries), 1800)
            raise current.retry(args=[kwargs], exc=e,
                                countdown=interval, max_retries=8)
    else:
        kwargs['instance_status'] = instance.status

        record = Record.query.\
            filter_by(instance_id=kwargs['instance_id']).first()
        record.instance_status = kwargs['instance_status']
        try:
            db.session.commit()
            if kwargs['email_addr']:
                send_mail(kwargs['email_addr'],
                          task_id=record.task_id,
                          instance_id=record.instance_id,
                          instance_status=record.instance_status,
                          memo=record.memo)
        except:
            db.session.rollback()
            raise

        return kwargs
Example #16
def process_resource_change(action, sender, instance_id, *args, **kwargs):
    # The class is serialized as a string when enqueueing the class.
    model = TYPES[sender]

    # Some resources are named differently than their model. eg. Group vs
    # Issue. Looks up the human name for the model. Defaults to the model name.
    name = RESOURCE_RENAMES.get(model.__name__, model.__name__.lower())

    # We may run into a race condition where this task executes before the
    # transaction that creates the Group has committed.
    try:
        instance = model.objects.get(id=instance_id)
    except model.DoesNotExist as e:
        # Explicitly requeue the task, so we don't report this to Sentry until
        # we hit the max number of retries.
        return current.retry(exc=e)

    event = '{}.{}'.format(name, action)

    if event not in VALID_EVENTS:
        return

    org = None

    if isinstance(instance, Group):
        org = instance.organization

    installations = filter(
        lambda i: event in i.sentry_app.events,
        org.sentry_app_installations.select_related('sentry_app'),
    )

    for installation in installations:
        send_webhooks(installation, event, data=serialize(instance))
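
The DoesNotExist branch in Examples #16, #20, #25 and #31 requeues the task because it can race the transaction that creates the row. A minimal sketch of the same pattern written as a bound Celery task; the app instance, the myapp.models import path and the Group model are placeholders, while self.retry and the bind/max_retries options are Celery's real API:

from celery import Celery

app = Celery('example')

@app.task(bind=True, max_retries=5, default_retry_delay=1)
def process_group(self, instance_id):
    from myapp.models import Group  # placeholder Django model import

    try:
        instance = Group.objects.get(id=instance_id)
    except Group.DoesNotExist as exc:
        # The creating transaction may not have committed yet, so requeue
        # instead of reporting the error until max_retries is exhausted.
        raise self.retry(exc=exc)

    return instance.id
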
Example #17
def retry_task_noargs(**kwargs):
    current.iterations += 1

    retries = kwargs["task_retries"]
    if retries >= 3:
        return 42
    else:
        return current.retry(countdown=0)
Example #18
def retry_task_noargs(**kwargs):
    current.iterations += 1

    retries = kwargs['task_retries']
    if retries >= 3:
        return 42
    else:
        raise current.retry(countdown=0)
Example #19
def retry_task(arg1, arg2, kwarg=1, max_retries=None, care=True):
    current.iterations += 1
    rmax = current.max_retries if max_retries is None else max_retries

    retries = current.request.retries
    if care and retries >= rmax:
        return arg1
    else:
        return current.retry(countdown=0, max_retries=rmax)
Example #20
def process_resource_change(action, sender, instance_id, *args, **kwargs):
    model = None
    name = None

    # Previous method signature.
    if inspect.isclass(sender):
        model = sender
    else:
        model = TYPES[sender]

    name = RESOURCE_RENAMES.get(model.__name__, model.__name__.lower())

    # We may run into a race condition where this task executes before the
    # transaction that creates the Group has committed.
    try:
        instance = model.objects.get(id=instance_id)
    except model.DoesNotExist as e:
        # Explicitly requeue the task, so we don't report this to Sentry until
        # we hit the max number of retries.
        return current.retry(exc=e)

    event = '{}.{}'.format(name, action)

    if event not in ALLOWED_EVENTS:
        return

    project = None

    if isinstance(instance, Group):
        project = instance.project

    if not project:
        return

    servicehooks = ServiceHook.objects.filter(
        project_id=project.id,
    )

    for servicehook in filter(lambda s: event in s.events, servicehooks):
        # For now, these ``post_save`` callbacks are only valid for service
        # hooks created by a Sentry App.
        if not servicehook.created_by_sentry_app:
            continue

        request_data = AppPlatformEvent(
            resource=name,
            action=action,
            install=SentryAppInstallation.objects.get(id=servicehook.actor_id),
            data=serialize(instance),
        )

        safe_urlopen(
            url=servicehook.url,
            data=request_data.body,
            headers=request_data.headers,
            timeout=5,
        )
Example #21
def retry_task_mockapply(arg1, arg2, kwarg=1, **kwargs):
    current.iterations += 1

    retries = kwargs['task_retries']
    if retries >= 3:
        return arg1
    else:
        kwargs.update(kwarg=kwarg)
    raise current.retry(countdown=0)
Example #22
def retry_task_mockapply(arg1, arg2, kwarg=1, **kwargs):
    current.iterations += 1

    retries = kwargs["task_retries"]
    if retries >= 3:
        return arg1
    else:
        kwargs.update(kwarg=kwarg)
    return current.retry(countdown=0)
Example #23
def retry_task(arg1, arg2, kwarg=1, max_retries=None, care=True):
    current.iterations += 1
    rmax = current.max_retries if max_retries is None else max_retries

    retries = current.request.retries
    if care and retries >= rmax:
        return arg1
    else:
        return current.retry(countdown=0, max_retries=rmax)
Example #24
def cache_thumbnail(id, upload_to_s3=False, marker=True):
    try:
        import shutil
        print settings.MAPBOX_ACCESS_TOKEN
        from firecares.firestation.models import FireDepartment
        department = FireDepartment.objects.get(id=id)

        filename = department.thumbnail_name
        generate_thumbnail = department.generate_thumbnail(marker=marker)

        if not marker:
            filename = department.thumbnail_name_no_marker

        full_filename = os.path.join('/home/firecares/department-thumbnails', filename)

        if not generate_thumbnail.startswith('/static'):
            f = download_file(generate_thumbnail, full_filename.replace('jpg', 'png'))
            full_filename = convert_png_to_jpg(f)
        else:
            shutil.copy('/webapps/firecares/firecares/firecares/firestation/static/firestation/theme/assets/images/content/property-1.jpg', full_filename)

        if upload_to_s3:
            c = boto.s3.connect_to_region('us-east-1',
                                          aws_access_key_id=getattr(settings, 'AWS_ACCESS_KEY_ID', None),
                                          aws_secret_access_key=getattr(settings, 'AWS_SECRET_ACCESS_KEY', None),
                                          is_secure=True,
                                          calling_format=boto.s3.connection.OrdinaryCallingFormat(),
                                          debug=2
                                          )

            b = c.get_bucket('firecares-static/department-thumbnails', validate=False)
            mtype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
            headers = {'Content-Type': mtype, 'Cache-Control': 'max-age=%d, public' % (3600 * 24)}
            singlepart_upload(b,
                              key_name=filename,
                              fullpath=full_filename,
                              policy='public-read',
                              reduced_redundancy=False,
                              headers=headers)

    except Exception as exc:
        if current.request.retries < 3:
            current.retry(exc=exc, countdown=min(2 ** current.request.retries, 128))
Example #25
def process_resource_change(action, sender, instance_id, *args, **kwargs):
    model = None
    name = None

    # Previous method signature.
    if inspect.isclass(sender):
        model = sender
    else:
        model = TYPES[sender]

    name = RESOURCE_RENAMES.get(model.__name__, model.__name__.lower())

    # We may run into a race condition where this task executes before the
    # transaction that creates the Group has committed.
    try:
        instance = model.objects.get(id=instance_id)
    except model.DoesNotExist as e:
        # Explicitly requeue the task, so we don't report this to Sentry until
        # we hit the max number of retries.
        return current.retry(exc=e)

    event = '{}.{}'.format(name, action)

    if event not in ALLOWED_EVENTS:
        return

    project = None

    if isinstance(instance, Group):
        project = instance.project

    if not project:
        return

    servicehooks = ServiceHook.objects.filter(project_id=project.id, )

    for servicehook in filter(lambda s: event in s.events, servicehooks):
        # For now, these ``post_save`` callbacks are only valid for service
        # hooks created by a Sentry App.
        if not servicehook.created_by_sentry_app:
            continue

        request_data = AppPlatformEvent(
            resource=name,
            action=action,
            install=SentryAppInstallation.objects.get(id=servicehook.actor_id),
            data=serialize(instance),
        )

        safe_urlopen(
            url=servicehook.url,
            data=request_data.body,
            headers=request_data.headers,
            timeout=5,
        )
Example #26
def retry_task_customexc(arg1, arg2, kwarg=1, **kwargs):
    current.iterations += 1

    retries = kwargs["task_retries"]
    if retries >= 3:
        return arg1 + kwarg
    else:
        try:
            raise MyCustomException("Elaine Marie Benes")
        except MyCustomException, exc:
            kwargs.update(kwarg=kwarg)
            return current.retry(countdown=0, exc=exc)
Example #27
def retry_task_customexc(arg1, arg2, kwarg=1, **kwargs):
    current.iterations += 1

    retries = kwargs['task_retries']
    if retries >= 3:
        return arg1 + kwarg
    else:
        try:
            raise MyCustomException('Elaine Marie Benes')
        except MyCustomException as exc:
            kwargs.update(kwarg=kwarg)
            raise current.retry(countdown=0, exc=exc)
Example #28
File: tasks.py Project: oii/ogre
def index_for_search(ebook_id=None, ebook_data=None):
    """
    Add ebook to the Whoosh search index
    """
    with app.app_context():
        if ebook_id and not ebook_data:
            ds = DataStore(app.config, app.logger)
            ebook_data = ds.load_ebook(ebook_id)
        elif not ebook_data:
            raise Exception('index_for_search task called without ebook_id or ebook_data params')

        try:
            # create search class and index
            search = Search(init_whoosh(app), pagelen=app.config.get('SEARCH_PAGELEN', 20))
            search.index_for_search(ebook_data)

        except whoosh.writing.LockError:
            # if index is unavailable try again in 10 secs
            current.retry(
                kwargs={'ebook_id': ebook_id, 'ebook_data': ebook_data},
                countdown=1,
            )
Example #29
def classify_btm(sample_id, from_name='', *args, **kwargs):
    """
        Classifies given samples
    """
    log.info(
        '[BTMClassification] Got sample %d for classification.' % sample_id
    )
    btm_sample = BeatTheMachineSample.objects.get(id=sample_id)
    if btm_sample.label:
        return

    job = btm_sample.job

    # If classifier is not trained, retry later
    if not job.is_classifier_trained():
        current.retry(countdown=min(60 * 2 ** current.request.retries,
            60 * 60 * 24))

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(btm_sample)
    if label is None:
        # Something went wrong
        log.warning(
            '[BTMClassification] Got None label for sample %d. Retrying.'
                % btm_sample.id
        )
        current.retry(countdown=min(60 * 2 ** current.request.retries,
            60 * 60 * 24))

    BeatTheMachineSample.objects.filter(id=sample_id).update(label=label)
    btm_sample.updateBTMStatus()

    send_event(
        'EventSampleBTM',
        job_id=job.id,
        btm_id=btm_sample.id,
        sample_id=btm_sample.sample.id,
    )
Example #30
def upload(bucket_name='androidpackage', filename=None, body=None):
    try:
        meta = {
            "version_name": None,
            "version_code": None,
            "min_sdk_version": None,
            "package": None,
            "md5": md5(body).hexdigest()
        }
        try:
            pkg = apk.APK(body, raw=True)
            meta["version_name"] = pkg.get_androidversion_name()
            meta["min_sdk_version"] = int(pkg.get_min_sdk_version())
            meta["version_code"] = int(pkg.get_androidversion_code())
            meta["package"] = pkg.get_package()
            meta["valid"] = 1
        except:
            meta["valid"] = 0

        conn = S3Connection()
        bucket = conn.get_bucket(bucket_name)
        if filename:
            app_key = filename
        else:
            if meta['valid']:
                app_key = "upload/%(package)s__%(version_code)d.apk" % meta
            else:
                app_key = "upload/novalid/%s.apk" % meta['md5']

        key = bucket.new_key(app_key)
        for k, v in meta.iteritems():
            if v:
                key.set_metadata(k, v)
        key.set_contents_from_string(body)
        key.close()
    except socket.error, e:
        current.retry(exc=e)
Example #31
def process_resource_change(sender, instance_id, *args, **kwargs):
    model = None
    name = None

    # Previous method signature.
    if inspect.isclass(sender):
        model = sender
    else:
        model = TYPES[sender]

    name = RESOURCE_RENAMES.get(model.__name__, model.__name__.lower())

    # We may run into a race condition where this task executes before the
    # transaction that creates the Group has committed.
    try:
        instance = model.objects.get(id=instance_id)
    except model.DoesNotExist as e:
        # Explicitly requeue the task, so we don't report this to Sentry until
        # we hit the max number of retries.
        return current.retry(exc=e)

    action = u'{}.created'.format(name)

    if action not in ALLOWED_ACTIONS:
        return

    project = None

    if isinstance(instance, Group):
        project = instance.project

    if not project:
        return

    servicehooks = ServiceHook.objects.filter(project_id=project.id, )

    for servicehook in filter(lambda s: action in s.events, servicehooks):
        # For now, these ``post_save`` callbacks are only valid for service
        # hooks created by a Sentry App.
        if not servicehook.created_by_sentry_app:
            continue

        payload = app_platform_event(
            action,
            SentryAppInstallation.objects.get(id=servicehook.actor_id),
            serialize(instance),
        )

        send_request(servicehook, payload, verify_ssl=True)
Example #32
def create_and_upload_archive(self, src_url, key):
    """
    A celery task that downloads an archive if it exists from a src location and attempts to upload
    the archive to a supported bucket in each supported region.

    Throughout this process, update the state of the task and finally return the location of the
    s3 urls if successful.

    expires after 30m if the task hasn't been picked up from the message queue

    task is killed if it exceeds the time_limit of an hour after it has started
    """
    status = "Task completed! Check 's3_urls' for upload locations."
    s3_urls = {}
    buckets = current_app.config['ARCHIVER_S3_BUCKETS']

    log.info('Key to be uploaded to S3: %s - Verifying src_url: %s', key, src_url)
    resp = requests.head(src_url)
    if resp.status_code == 200:
        try:
            s3_urls = upload_url_archive_to_s3(key, src_url, buckets)
        except Exception as exc:
            # set a jitter enabled delay
            # where an aggressive delay would result in: 7s, 49s, and 343s
            # and a gentle delay would result in: 4s, 16s, and 64s
            delay = randint(4, 7) ** (current.request.retries + 1)  # retries == 0 on first attempt
            current.retry(exc=exc, countdown=delay)
    else:
        status = "Url not found. Does it exist? url: '{}', response: '{}' ".format(src_url,
                                                                                   resp.status_code)
        log.warning(status)
    return {
        'status': status,
        'src_url': src_url,
        's3_urls': s3_urls,
    }
Example #33
def classify_btm(sample_id, from_name='', *args, **kwargs):
    """
        Classifies given samples
    """
    log.info('[BTMClassification] Got sample %d for classification.' %
             sample_id)
    btm_sample = BeatTheMachineSample.objects.get(id=sample_id)
    if btm_sample.label:
        return

    job = btm_sample.job

    # If classifier is not trained, retry later
    if not job.is_classifier_trained():
        current.retry(countdown=min(60 * 2 ** current.request.retries, 60 * 60 * 24))

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(btm_sample)
    if label is None:
        # Something went wrong
        log.warning(
            '[BTMClassification] Got None label for sample %d. Retrying.' %
            btm_sample.id)
        current.retry(countdown=min(60 * 2 ** current.request.retries, 60 * 60 * 24))

    BeatTheMachineSample.objects.filter(id=sample_id).update(label=label)
    btm_sample.updateBTMStatus()

    send_event(
        'EventSampleBTM',
        job_id=job.id,
        btm_id=btm_sample.id,
        sample_id=btm_sample.sample.id,
    )
Example #34
def sync_with_changelog():
    """Syncronize with pypi changelog.

    Right now we only listen for `new-release`, `remove`, `rename`,
    and `create` as we do not store any metadata information.

    The following actions can be issued according to the PyPI source code:
        new release			- Creates a new Release
        remove				- Removes a Package from the Shop
        rename from %(old)s		- Rename a package
        add %(pyversion)s %(filename)s  - Add a new file to a version
        remove file %(filename)s        - Remove a file
        docupdate                       - Notify for documentation update
        create				- Create a new package
        update %(type)s                 - Update some detailed classifiers
    """
    next_last_sync = timezone.now()

    state, created = SyncState.objects.get_or_create(type=SyncState.CHANGELOG)

    epoch = int(time.mktime(state.last_sync.timetuple()))

    client = CheeseShop()

    try:
        log = client.get_changelog(epoch, True)
    except socket.error as exc:
        if current.iterations == current.max_retries:
            SyncState.objects.filter(type=SyncState.CHANGELOG) \
                             .update(state=SyncState.STATE_DOWN)
            logger.warning('No sync with PyPi, it\'s not reachable.')
            return
        else:
            current.iterations += 1
            current.retry(countdown=0, exc=exc)
    else:
        projects = set()
        for package, version, stamp, action in log:
            if action == 'new release':
                try:
                    pkg = Package.objects.get(name=package)
                except Package.DoesNotExist:
                    pkg = Package.create_with_provider_url(package)

                dt = datetime.datetime.fromtimestamp(stamp)
                release_date = timezone.make_aware(dt, pytz.UTC)
                exists = PackageVersion.objects.filter(package=pkg,
                                                       version=version).exists()
                if not exists:
                    update = PackageVersion(version=version,
                                            release_date=release_date)
                    pkg.versions.add(update)
                    ProjectDependency.objects.filter(package=pkg) \
                                             .update(update=update)

                projects.update(Project.objects.filter(dependencies__package=pkg)
                                               .values_list('id', flat=True))

            elif action == 'remove':
                # We only clear versions and set the recent updated version
                # on every project dependency to NULL. This way we can ensure
                # stability on ProjectDependency.
                try:
                    pkg = Package.objects.get(name=package)
                    ProjectDependency.objects.filter(package=pkg) \
                                             .update(update=None)

                    if version is None:
                        pkg.versions.all().delete()

                    log_affected_projects(pkg, action='remove_package',
                                          type='package', package=pkg)
                except Package.DoesNotExist:
                    pass

            elif action == 'create':
                if not Package.objects.filter(name=package).exists():
                    Package.create_with_provider_url(package)

        for project in projects:
            sync_project.apply(args=(project,))

        SyncState.objects.filter(type=SyncState.CHANGELOG) \
                         .update(last_sync=next_last_sync,
                                 state=SyncState.STATE_RUNNING)
Example #35
def assemble_download(
    data_export_id,
    export_limit=EXPORTED_ROWS_LIMIT,
    batch_size=SNUBA_MAX_RESULTS,
    offset=0,
    bytes_written=0,
    environment_id=None,
    **kwargs
):
    with sentry_sdk.start_transaction(
        op="task.data_export.assemble", name="DataExportAssemble", sampled=True,
    ):
        first_page = offset == 0

        try:
            if first_page:
                logger.info("dataexport.start", extra={"data_export_id": data_export_id})
            data_export = ExportedData.objects.get(id=data_export_id)
            if first_page:
                metrics.incr("dataexport.start", tags={"success": True}, sample_rate=1.0)
            logger.info(
                "dataexport.run", extra={"data_export_id": data_export_id, "offset": offset}
            )
        except ExportedData.DoesNotExist as error:
            if first_page:
                metrics.incr("dataexport.start", tags={"success": False}, sample_rate=1.0)
            logger.exception(error)
            return

        with sentry_sdk.configure_scope() as scope:
            if data_export.user:
                user = {}
                if data_export.user.id:
                    user["id"] = data_export.user.id
                if data_export.user.username:
                    user["username"] = data_export.user.username
                if data_export.user.email:
                    user["email"] = data_export.user.email
                scope.user = user
            scope.set_tag("organization.slug", data_export.organization.slug)
            scope.set_tag("export.type", ExportQueryType.as_str(data_export.query_type))
            scope.set_extra("export.query", data_export.query_info)

        try:
            # ensure that the export limit is set and capped at EXPORTED_ROWS_LIMIT
            if export_limit is None:
                export_limit = EXPORTED_ROWS_LIMIT
            else:
                export_limit = min(export_limit, EXPORTED_ROWS_LIMIT)

            processor = get_processor(data_export, environment_id)

            with tempfile.TemporaryFile(mode="w+b") as tf:
                # XXX(python3):
                #
                # In python2 land we write utf-8 encoded strings as bytes via
                # the csv writer (see convert_to_utf8). The CSV writer will
                # ONLY write bytes, even if you give it unicode it will convert
                # it to bytes.
                #
                # In python3 we write unicode strings (which is all the csv
                # module is able to do, it will NOT write bytes like in py2).
                # Because of this we use the codec getwriter to transform our
                # file handle to a stream writer that will encode to utf8.
                if six.PY2:
                    tfw = tf
                else:
                    tfw = codecs.getwriter("utf-8")(tf)

                writer = csv.DictWriter(tfw, processor.header_fields, extrasaction="ignore")
                if first_page:
                    writer.writeheader()

                # the position in the file at the end of the headers
                starting_pos = tf.tell()

                # the row offset relative to the start of the current task
                # this offset tells you the number of rows written during this batch fragment
                fragment_offset = 0

                # the absolute row offset from the beginning of the export
                next_offset = offset + fragment_offset

                while True:
                    # the number of rows to export in the next batch fragment
                    fragment_row_count = min(batch_size, max(export_limit - next_offset, 1))

                    rows = process_rows(processor, data_export, fragment_row_count, next_offset)
                    writer.writerows(rows)

                    fragment_offset += len(rows)
                    next_offset = offset + fragment_offset

                    if (
                        not rows
                        or len(rows) < batch_size
                        # the batch may exceed MAX_BATCH_SIZE but immediately stops
                        or tf.tell() - starting_pos >= MAX_BATCH_SIZE
                    ):
                        break

                tf.seek(0)
                new_bytes_written = store_export_chunk_as_blob(data_export, bytes_written, tf)
                bytes_written += new_bytes_written
        except ExportError as error:
            return data_export.email_failure(message=six.text_type(error))
        except Exception as error:
            metrics.incr("dataexport.error", tags={"error": six.text_type(error)}, sample_rate=1.0)
            logger.error(
                "dataexport.error: %s",
                six.text_type(error),
                extra={"query": data_export.payload, "org": data_export.organization_id},
            )
            capture_exception(error)

            try:
                current.retry()
            except MaxRetriesExceededError:
                metrics.incr(
                    "dataexport.end",
                    tags={"success": False, "error": six.text_type(error)},
                    sample_rate=1.0,
                )
                return data_export.email_failure(message="Internal processing failure")
        else:
            if (
                rows
                and len(rows) >= batch_size
                and new_bytes_written
                and next_offset < export_limit
            ):
                assemble_download.delay(
                    data_export_id,
                    export_limit=export_limit,
                    batch_size=batch_size,
                    offset=next_offset,
                    bytes_written=bytes_written,
                    environment_id=environment_id,
                )
            else:
                metrics.timing("dataexport.row_count", next_offset, sample_rate=1.0)
                metrics.timing("dataexport.file_size", bytes_written, sample_rate=1.0)
                merge_export_blobs.delay(data_export_id)
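
Examples #35, #37 and #40 all follow the same shape: export one batch, persist it, then either enqueue the next page with .delay() or finalize the export. A stripped-down sketch of that self-chaining pattern; fetch_rows, store_chunk and finalize are hypothetical helpers and app is an assumed Celery instance:

from celery import Celery

app = Celery('example')

BATCH_SIZE = 1000
EXPORT_LIMIT = 10000000

def fetch_rows(export_id, limit, offset):
    raise NotImplementedError  # placeholder data source

def store_chunk(export_id, rows):
    raise NotImplementedError  # placeholder blob storage

def finalize(export_id):
    raise NotImplementedError  # placeholder "merge blobs" step

@app.task(bind=True, max_retries=3)
def assemble_export(self, export_id, offset=0):
    try:
        # fetch at most one batch, never past the overall export limit
        row_count = min(BATCH_SIZE, max(EXPORT_LIMIT - offset, 1))
        rows = fetch_rows(export_id, row_count, offset)
        store_chunk(export_id, rows)
    except Exception as exc:
        # transient failure: retry this same page
        raise self.retry(exc=exc)

    next_offset = offset + len(rows)
    if rows and len(rows) >= BATCH_SIZE and next_offset < EXPORT_LIMIT:
        # more data remains: enqueue the next page as a fresh task
        assemble_export.delay(export_id, offset=next_offset)
    else:
        finalize(export_id)
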
Example #36
 def wrapped(*args, **kwargs):
     try:
         return func(*args, **kwargs)
     except Exception as exc:
         current.retry(exc=exc)
Example #37
def assemble_download(data_export_id,
                      export_limit=EXPORTED_ROWS_LIMIT,
                      batch_size=SNUBA_MAX_RESULTS,
                      offset=0,
                      bytes_written=0,
                      environment_id=None,
                      **kwargs):
    first_page = offset == 0

    try:
        if first_page:
            logger.info("dataexport.start",
                        extra={"data_export_id": data_export_id})
        data_export = ExportedData.objects.get(id=data_export_id)
        if first_page:
            metrics.incr("dataexport.start",
                         tags={"success": True},
                         sample_rate=1.0)
        logger.info("dataexport.run",
                    extra={
                        "data_export_id": data_export_id,
                        "offset": offset
                    })
    except ExportedData.DoesNotExist as error:
        if first_page:
            metrics.incr("dataexport.start",
                         tags={"success": False},
                         sample_rate=1.0)
        logger.exception(error)
        return

    try:
        if export_limit is None:
            export_limit = EXPORTED_ROWS_LIMIT
        else:
            export_limit = min(export_limit, EXPORTED_ROWS_LIMIT)

        # if there is an export limit, the last batch should only return up to the export limit
        if export_limit is not None:
            batch_size = min(batch_size, max(export_limit - offset, 0))

        processor = get_processor(data_export, environment_id)

        with tempfile.TemporaryFile() as tf:
            writer = csv.DictWriter(tf,
                                    processor.header_fields,
                                    extrasaction="ignore")
            if first_page:
                writer.writeheader()

            rows = process_rows(processor, data_export, batch_size, offset)
            writer.writerows(rows)

            next_offset = offset + len(rows)

            tf.seek(0)
            new_bytes_written = store_export_chunk_as_blob(
                data_export, bytes_written, tf)
            bytes_written += new_bytes_written
    except ExportError as error:
        return data_export.email_failure(message=six.text_type(error))
    except Exception as error:
        metrics.incr("dataexport.error",
                     tags={"error": six.text_type(error)},
                     sample_rate=1.0)
        logger.error(
            "dataexport.error: %s",
            six.text_type(error),
            extra={
                "query": data_export.payload,
                "org": data_export.organization_id
            },
        )
        capture_exception(error)

        try:
            current.retry()
        except MaxRetriesExceededError:
            return data_export.email_failure(
                message="Internal processing failure")
    else:
        if (rows and len(rows) >= batch_size and new_bytes_written
                and (export_limit is None or next_offset < export_limit)):
            assemble_download.delay(
                data_export_id,
                export_limit=export_limit,
                batch_size=batch_size,
                offset=next_offset,
                bytes_written=bytes_written,
                environment_id=environment_id,
            )
        else:
            merge_export_blobs.delay(data_export_id)
Example #38
 def wrapped(*args, **kwargs):
     try:
         return func(*args, **kwargs)
     except Exception as exc:
         current.retry(exc=exc)
Example #39
            sample_url=sample.url,
            job_id=sample.job_id,
        )
    except subprocess.CalledProcessError, e:
        # Something wrong has happened to links. Couldn't find documentation on
        # error codes - assume bad stuff has happened that retrying won't fix.
        send_event(
            'EventSampleContentFail',
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
            error_code=e.returncode
        )
        return False
    except DatabaseError, e:
        current.retry(exc=e, countdown=min(60 * 2 ** current.request.retries,
            60 * 60 * 24))

    return text != ''


@task()
def web_screenshot_extraction(sample_id, url=None, *args, **kwargs):
    """ Generates html output from those browsers.
    """
    if url is None:
        url = Sample.objects.get(id=sample_id).url

    if not is_proper_url(url):
        return False

    sample = Sample.objects.get(id=sample_id)
Example #40
def assemble_download(data_export_id,
                      export_limit=EXPORTED_ROWS_LIMIT,
                      batch_size=SNUBA_MAX_RESULTS,
                      offset=0,
                      bytes_written=0,
                      environment_id=None,
                      **kwargs):
    first_page = offset == 0

    try:
        if first_page:
            logger.info("dataexport.start",
                        extra={"data_export_id": data_export_id})
        data_export = ExportedData.objects.get(id=data_export_id)
        if first_page:
            metrics.incr("dataexport.start",
                         tags={"success": True},
                         sample_rate=1.0)
        logger.info("dataexport.run",
                    extra={
                        "data_export_id": data_export_id,
                        "offset": offset
                    })
    except ExportedData.DoesNotExist as error:
        if first_page:
            metrics.incr("dataexport.start",
                         tags={"success": False},
                         sample_rate=1.0)
        logger.exception(error)
        return

    try:
        # ensure that the export limit is set and capped at EXPORTED_ROWS_LIMIT
        if export_limit is None:
            export_limit = EXPORTED_ROWS_LIMIT
        else:
            export_limit = min(export_limit, EXPORTED_ROWS_LIMIT)

        processor = get_processor(data_export, environment_id)

        with tempfile.TemporaryFile() as tf:
            writer = csv.DictWriter(tf,
                                    processor.header_fields,
                                    extrasaction="ignore")
            if first_page:
                writer.writeheader()

            # the position in the file at the end of the headers
            starting_pos = tf.tell()

            # the row offset relative to the start of the current task
            # this offset tells you the number of rows written during this batch fragment
            fragment_offset = 0

            # the absolute row offset from the beginning of the export
            next_offset = offset + fragment_offset

            while True:
                # the number of rows to export in the next batch fragment
                fragment_row_count = min(batch_size,
                                         max(export_limit - next_offset, 1))

                rows = process_rows(processor, data_export, fragment_row_count,
                                    next_offset)
                writer.writerows(rows)

                fragment_offset += len(rows)
                next_offset = offset + fragment_offset

                if (not rows or len(rows) < batch_size
                        # the batch may exceed MAX_BATCH_SIZE but immediately stops
                        or tf.tell() - starting_pos >= MAX_BATCH_SIZE):
                    break

            tf.seek(0)
            new_bytes_written = store_export_chunk_as_blob(
                data_export, bytes_written, tf)
            bytes_written += new_bytes_written
    except ExportError as error:
        return data_export.email_failure(message=six.text_type(error))
    except Exception as error:
        metrics.incr("dataexport.error",
                     tags={"error": six.text_type(error)},
                     sample_rate=1.0)
        logger.error(
            "dataexport.error: %s",
            six.text_type(error),
            extra={
                "query": data_export.payload,
                "org": data_export.organization_id
            },
        )
        capture_exception(error)

        try:
            current.retry()
        except MaxRetriesExceededError:
            metrics.incr(
                "dataexport.end",
                tags={
                    "success": False,
                    "error": six.text_type(error)
                },
                sample_rate=1.0,
            )
            return data_export.email_failure(
                message="Internal processing failure")
    else:
        if (rows and len(rows) >= batch_size and new_bytes_written
                and next_offset < export_limit):
            assemble_download.delay(
                data_export_id,
                export_limit=export_limit,
                batch_size=batch_size,
                offset=next_offset,
                bytes_written=bytes_written,
                environment_id=environment_id,
            )
        else:
            metrics.timing("dataexport.row_count", next_offset)
            metrics.timing("dataexport.file_size", bytes_written)
            merge_export_blobs.delay(data_export_id)