Example #1
 def link_to_log(self, log, datum_ids):
     if not datum_ids:
         return True
     logger.debug('Linking RawData to %r', log)
     with connection.cursor() as cursor:
         for chunk in chunked(datum_ids, size=500):
             if not chunk:
                 break
             cursor.execute(
                 '''
                 INSERT INTO "{table}"
                     ("{rawdatum}", "{harvestlog}")
                 VALUES
                     {values}
                 ON CONFLICT ("{rawdatum}", "{harvestlog}") DO NOTHING;
             '''.format(
                     values=', '.join(
                         '%s' for _ in range(len(chunk))
                     ),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                     table=RawDatum.logs.through._meta.db_table,
                     rawdatum=RawDatum.logs.through._meta.get_field(
                         'rawdatum').column,
                     harvestlog=RawDatum.logs.through._meta.get_field(
                         'harvestlog').column,
                 ),
                 [(raw_id, log.id) for raw_id in chunk])
     return True
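Every example on this page pushes an iterable through a chunked helper, called both positionally (chunked(objs, 500)) and by keyword (chunked(datum_ids, size=500)). Below is a minimal sketch written from that usage alone, not the project's actual implementation; the "if not chunk: break" guards above suggest the real helper can yield a trailing empty chunk, which this version never does.

import itertools

def chunked(iterable, size):
    # Sketch inferred from the call sites above; the project's real helper
    # may behave differently (e.g. yield tuples or a trailing empty chunk).
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, size))
        if not chunk:
            return
        yield chunk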
Example #2
    def bulk_get_or_create(self, objs, defaults=None, using='default'):
        if len(self.model._meta.unique_together) != 1:
            raise ValueError(
                'Cannot determine the constraint to use for ON CONFLICT')

        if not objs:
            return []

        columns = []
        defaults = defaults or {}

        for field in self.model._meta.concrete_fields:
            if field is not self.model._meta.pk:
                columns.append(field.column)
            if field in defaults:
                continue
            if field.default is not models.NOT_PROVIDED or field.null:
                defaults[field] = field._get_default()
            elif isinstance(field, models.DateField) and (field.auto_now or
                                                          field.auto_now_add):
                defaults[field] = timezone.now()

        if any(obj.pk for obj in objs):
            raise ValueError(
                'Cannot bulk_get_or_create objects with primary keys')

        constraint = ', '.join(
            '"{1.column}"'.format(self.model, self.model._meta.get_field(
                field)) for field in self.model._meta.unique_together[0])

        loaded = []
        with transaction.atomic(using):
            for chunk in chunked(objs, 500):
                if not chunk:
                    break
                loaded.extend(
                    self.raw(
                        '''
                    INSERT INTO "{model._meta.db_table}"
                        ({columns})
                    VALUES
                        {values}
                    ON CONFLICT
                        ({constraint})
                    DO UPDATE SET
                        id = "{model._meta.db_table}".id
                    RETURNING *
                '''.format(
                            model=self.model,
                            columns=', '.join(columns),
                            constraint=constraint,
                            values=', '.join(['%s'] * len(chunk)),
                        ), [
                            tuple(
                                getattr(obj, field.attname, None)
                                or defaults[field] for field in
                                self.model._meta.concrete_fields[1:])
                            for obj in chunk
                        ]))
        return loaded
Example #3
 def __iter__(self):
     opts = {'_index': self._index, '_type': self._model._meta.verbose_name_plural.replace(' ', '')}
     for chunk in util.chunked(self._flatten(), size=250):
         for result in self._fetcher(chunk):
             if result is None:
                 yield None
             elif result.pop('is_deleted', False):
                 yield {'_id': result['id'], '_op_type': 'delete', **opts}
             else:
                 yield {'_id': result['id'], '_op_type': 'index', **opts, **result}
Example #4
    def _bulk_query(self, query, default_values, data, db_alias):
        fields = [field.name for field in self.model._meta.concrete_fields]

        with connection.cursor() as cursor:
            for chunk in chunked(data, 500):
                if not chunk:
                    break
                cursor.execute(query.format(
                    values=', '.join('%s' for _ in range(len(chunk))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                ), [c + default_values for c in chunk])

                for row in cursor.fetchall():
                    yield self.model.from_db(db_alias, fields, row)
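The ', '.join('%s' ...) trick exists only to splice one placeholder per row into a single INSERT statement. The comment points at psycopg2.extras.execute_values (psycopg2 2.7+), which performs that expansion itself. Below is a rough sketch of the idea, not a drop-in replacement for _bulk_query: the table and column handling is deliberately simplified, and conflict handling is reduced to DO NOTHING.

from psycopg2.extras import execute_values

def bulk_insert_ignore(cursor, table, columns, rows, page_size=500):
    # execute_values expands the single VALUES %s placeholder into one tuple
    # per row and sends the rows in batches of page_size, so no hand-built
    # list of placeholders is needed.
    sql = 'INSERT INTO {table} ({columns}) VALUES %s ON CONFLICT DO NOTHING'.format(
        table=table,
        columns=', '.join(columns),
    )
    execute_values(cursor, sql, rows, page_size=page_size)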
Example #6
 def __iter__(self):
     opts = {
         '_index': self._index,
         '_type': self._model._meta.verbose_name_plural.replace(' ', '')
     }
     for chunk in util.chunked(self._flatten(), size=self._size):
         for result in self._fetcher(chunk):
             if result is None:
                 yield None
             elif result.pop('is_deleted', False):
                 yield {'_id': result['id'], '_op_type': 'delete', **opts}
             else:
                 yield {
                     '_id': result['id'],
                     '_op_type': 'index',
                     **opts,
                     **result
                 }
Example #7
    def archive_queryset(self, task_name, queryset):
        if self.bucket is None:
            logger.warning('%r.bucket is None. Results will NOT be archived', self)
            return None

        if task_name in self.NO_ARCHIVE:
            logger.info('Found %s in NO_ARCHIVE, archival will be skipped', task_name)

        total = queryset.count()
        logger.info('Found %s %ss eligible for archiving', total, task_name)
        logger.info('Archiving in chunks of %d', self.chunk_size)

        i = 0
        for chunk in chunked(queryset.iterator(), size=self.chunk_size):
            compressed = self.compress_and_serialize(chunk)
            self.put_s3(task_name, compressed)
            i += len(chunk)
            logger.info('Archived %d of %d', i, total)
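compress_and_serialize and put_s3 are project helpers whose bodies are not shown in these examples. Purely to make the archiving step concrete, here is a hypothetical compress_and_serialize built on Django's JSON serializer and gzip; the real format and key scheme may be entirely different.

import gzip

from django.core import serializers

def compress_and_serialize(chunk):
    # Hypothetical sketch: render the chunk of model instances as JSON with
    # Django's built-in serializer, then gzip the bytes before upload.
    payload = serializers.serialize('json', chunk)
    return gzip.compress(payload.encode('utf-8'))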
Example #8
    def delete_queryset(self, queryset):
        if not self.delete:
            logger.warning('%r.delete is False. Results will NOT be deleted', self)
            return 0

        total_deleted = 0

        try:
            with transaction.atomic():
                # .delete loads the entire queryset and can't be sliced... Hooray
                for ids in chunked(queryset.values_list('id', flat=True).iterator(), size=self.chunk_size):
                    num_deleted, _ = queryset.model.objects.filter(id__in=ids).delete()
                    total_deleted += num_deleted
        except Exception as e:
            logger.exception('Failed to delete queryset with exception %s', e)
            raise

        logger.info('Deleted %s CeleryTasks', total_deleted)
        return total_deleted
Example #9
    def archive_queryset(self, task_name, queryset):
        if self.bucket is None:
            logger.warning('%r.bucket is None. Results will NOT be archived',
                           self)
            return None

        if task_name in self.NO_ARCHIVE:
            logger.info('Found %s in NO_ARCHIVE, archival will be skipped',
                        task_name)

        total = queryset.count()
        logger.info('Found %s %ss eligible for archiving', total, task_name)
        logger.info('Archiving in chunks of %d', self.chunk_size)

        i = 0
        for chunk in chunked(queryset.iterator(), size=self.chunk_size):
            compressed = self.compress_and_serialize(chunk)
            self.put_s3(task_name, compressed)
            i += len(chunk)
            logger.info('Archived %d of %d', i, total)
Example #10
 def _consume_job(self, job, force, superfluous, limit=None, ingest=True):
     try:
         if ingest:
             datum_gen = (datum for datum in self._harvest(job, force, limit) if datum.created or superfluous)
             for chunk in chunked(datum_gen, 500):
                 self._bulk_schedule_ingest(job, chunk)
         else:
             for _ in self._harvest(job, force, limit):
                 pass
     except HarvesterConcurrencyError as e:
         if not self.task:
             raise
         # If job_id was specified there's a chance that the advisory lock was not, in fact, acquired.
         # If so, retry indefinitely to preserve existing functionality.
         # Use random to add jitter to help break up locking issues
         # Kinda hacky, allow a stupidly large number of retries as there is no option for infinite
         raise self.task.retry(
             exc=e,
             max_retries=99999,
             countdown=(random.random() + 1) * min(settings.CELERY_RETRY_BACKOFF_BASE ** self.task.request.retries, 60 * 15)
         )
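The countdown expression is capped exponential backoff with jitter: the delay grows as CELERY_RETRY_BACKOFF_BASE ** retries, is capped at 15 minutes, and is then multiplied by a random factor in [1, 2) so that concurrently failing tasks do not retry in lockstep. The same formula pulled out as a standalone helper; the function name and the example base of 2 are illustrative, not from the project.

import random

def retry_countdown(retries, backoff_base=2, cap=60 * 15):
    # Capped exponential backoff with jitter, mirroring the expression above.
    return (random.random() + 1) * min(backoff_base ** retries, cap)

# retries=0 -> 1-2 s, retries=4 -> 16-32 s, retries >= 10 -> capped at 900-1800 s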
Example #11
 def link_to_log(self, log, datum_ids):
     if not datum_ids:
         return True
     logger.debug('Linking RawData to %r', log)
     with connection.cursor() as cursor:
         for chunk in chunked(datum_ids, size=500):
             if not chunk:
                 break
             cursor.execute('''
                 INSERT INTO "{table}"
                     ("{rawdatum}", "{harvestlog}")
                 VALUES
                     {values}
                 ON CONFLICT ("{rawdatum}", "{harvestlog}") DO NOTHING;
             '''.format(
                 values=', '.join('%s' for _ in range(len(chunk))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                 table=RawDatum.logs.through._meta.db_table,
                 rawdatum=RawDatum.logs.through._meta.get_field('rawdatum').column,
                 harvestlog=RawDatum.logs.through._meta.get_field('harvestlog').column,
             ), [(raw_id, log.id) for raw_id in chunk])
     return True
Example #13
    def delete_queryset(self, queryset):
        if not self.delete:
            logger.warning('%r.delete is False. Results will NOT be deleted',
                           self)
            return 0

        total_deleted = 0

        try:
            with transaction.atomic():
                # .delete loads the entire queryset and can't be sliced... Hooray
                for ids in chunked(queryset.values_list('id',
                                                        flat=True).iterator(),
                                   size=self.chunk_size):
                    num_deleted, _ = queryset.model.objects.filter(
                        id__in=ids).delete()
                    total_deleted += num_deleted
        except Exception as e:
            logger.exception('Failed to delete queryset with exception %s', e)
            raise

        logger.info('Deleted %s CeleryTasks', total_deleted)
        return total_deleted
Example #14
    def store_chunk(self, source_config, data, limit=None, db=DEFAULT_DB_ALIAS):
        """Store a large amount of data for a single source_config.

        Data MUST be a utf-8 encoded string (Just a str type).
        Take special care to make sure you aren't destroying data by mis-encoding it.

        Args:
            source_config (SourceConfig):
            data Generator[(str, str)]: (identifier, datum)

        Returns:
            Generator[RawDatum]
        """
        hashes = {}
        identifiers = {}
        now = timezone.now()

        if limit == 0:
            return []

        for chunk in chunked(data, 500):
            if not chunk:
                break

            new = []
            new_identifiers = set()
            for fr in chunk:
                if limit and len(hashes) >= limit:
                    break

                if fr.sha256 in hashes:
                    if hashes[fr.sha256] != fr.identifier:
                        raise ValueError(
                            '{!r} has already been seen or stored with identifier "{}". '
                            'Perhaps your identifier extraction is incorrect?'.format(fr, hashes[fr.sha256])
                        )
                logger.warning('Received duplicate datum %s from %s', fr, source_config)
                    continue

                new.append(fr)
                hashes[fr.sha256] = fr.identifier
                new_identifiers.add(fr.identifier)

            if new_identifiers:
                suids = SourceUniqueIdentifier.objects.raw('''
                    INSERT INTO "{table}"
                        ("{identifier}", "{source_config}")
                    VALUES
                        {values}
                    ON CONFLICT
                        ("{identifier}", "{source_config}")
                    DO UPDATE SET
                        id = "{table}".id
                    RETURNING {fields}
                '''.format(
                    table=SourceUniqueIdentifier._meta.db_table,
                    identifier=SourceUniqueIdentifier._meta.get_field('identifier').column,
                    source_config=SourceUniqueIdentifier._meta.get_field('source_config').column,
                    values=placeholders(len(new_identifiers)),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                    fields=', '.join('"{}"'.format(field.column) for field in SourceUniqueIdentifier._meta.concrete_fields),
                ), [(identifier, source_config.id) for identifier in new_identifiers])

                for suid in suids:
                    identifiers[suid.identifier] = suid.pk

            if new:
                # Defer 'datum' by omitting it from the returned fields
                yield from RawDatum.objects.raw(
                    '''
                        INSERT INTO "{table}"
                            ("{suid}", "{hash}", "{datum}", "{datestamp}", "{date_modified}", "{date_created}")
                        VALUES
                            {values}
                        ON CONFLICT
                            ("{suid}", "{hash}")
                        DO UPDATE SET
                            "{datestamp}" = EXCLUDED."{datestamp}",
                            "{date_modified}" = EXCLUDED."{date_modified}"
                        RETURNING id, "{suid}", "{hash}", "{datestamp}", "{date_modified}", "{date_created}"
                    '''.format(
                        table=RawDatum._meta.db_table,
                        suid=RawDatum._meta.get_field('suid').column,
                        hash=RawDatum._meta.get_field('sha256').column,
                        datum=RawDatum._meta.get_field('datum').column,
                        datestamp=RawDatum._meta.get_field('datestamp').column,
                        date_modified=RawDatum._meta.get_field('date_modified').column,
                        date_created=RawDatum._meta.get_field('date_created').column,
                        values=', '.join('%s' for _ in range(len(new))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                    ), [
                        (identifiers[fr.identifier], fr.sha256, fr.datum, fr.datestamp or now, now, now)
                        for fr in new
                    ]
                )

            if limit and len(hashes) >= limit:
                break
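store_chunk is a generator: nothing is written until it is iterated, and each yielded RawDatum comes back with a primary key because the INSERT uses RETURNING. A hypothetical call site follows; raw_datum_manager stands in for wherever the method is actually exposed, and fetch_results for an iterable of objects carrying the identifier, sha256, datum and datestamp attributes used above.

# Hypothetical call site: every name here is a placeholder, not project API.
stored_ids = [
    raw.id
    for raw in raw_datum_manager.store_chunk(source_config, fetch_results, limit=1000)
]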
Example #15
    def store_chunk(self, source_config, data, limit=None, db=DEFAULT_DB_ALIAS):
        """Store a large amount of data for a single source_config.

        Data MUST be a utf-8 encoded string (Just a str type).
        Take special care to make sure you aren't destroying data by mis-encoding it.

        Args:
            source_config (SourceConfig):
            data Generator[(str, str)]: (identifier, datum)

        Returns:
            Generator[MemoryFriendlyRawDatum]
        """
        unique_data = set()
        now = timezone.now()

        with connection.cursor() as cursor:
            for chunk in chunked(data, 500):
                chunk_data = []
                for identifier, datum in chunk:
                    if limit is not None and len(unique_data) >= limit:
                        break
                    hash_ = sha256(datum.encode('utf-8')).hexdigest()
                    chunk_data.append((identifier, hash_, datum))
                    unique_data.add((identifier, hash_))

                if not chunk_data:
                    break

                identifiers = list({(identifier, source_config.id) for identifier, _, _ in chunk_data})

                cursor.execute('''
                    INSERT INTO "{table}"
                        ("{identifier}", "{source_config}")
                    VALUES
                        {values}
                    ON CONFLICT
                        ("{identifier}", "{source_config}")
                    DO UPDATE SET
                        id = "{table}".id
                    RETURNING {fields}
                '''.format(
                    table=SourceUniqueIdentifier._meta.db_table,
                    identifier=SourceUniqueIdentifier._meta.get_field('identifier').column,
                    source_config=SourceUniqueIdentifier._meta.get_field('source_config').column,
                    values=', '.join('%s' for _ in range(len(identifiers))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                    fields=', '.join('"{}"'.format(field.column) for field in SourceUniqueIdentifier._meta.concrete_fields),
                ), identifiers)

                suids = {}
                fields = [field.attname for field in SourceUniqueIdentifier._meta.concrete_fields]
                for row in cursor.fetchall():
                    suid = SourceUniqueIdentifier.from_db(db, fields, row)
                    suids[suid.pk] = suid
                    suids[suid.identifier] = suid

                raw_data = {}
                for identifier, hash_, datum in chunk_data:
                    raw_data[identifier, hash_] = (suids[identifier].pk, hash_, datum, now, now)

                cursor.execute('''
                    INSERT INTO "{table}"
                        ("{suid}", "{hash}", "{datum}", "{date_created}", "{date_modified}")
                    VALUES
                        {values}
                    ON CONFLICT
                        ("{suid}", "{hash}")
                    DO UPDATE SET
                        "{date_modified}" = %s
                    RETURNING id, "{suid}", "{hash}", "{date_created}", "{date_modified}"
                '''.format(
                    table=RawDatum._meta.db_table,
                    suid=RawDatum._meta.get_field('suid').column,
                    hash=RawDatum._meta.get_field('sha256').column,
                    datum=RawDatum._meta.get_field('datum').column,
                    date_created=RawDatum._meta.get_field('date_created').column,
                    date_modified=RawDatum._meta.get_field('date_modified').column,
                    values=', '.join('%s' for _ in range(len(raw_data))),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                ), list(raw_data.values()) + [now])

                for row in cursor.fetchall():
                    yield MemoryFriendlyRawDatum.from_db(db, ('id', 'suid', 'sha256', 'date_created', 'date_modified'), row[:1] + (suids[row[1]], ) + row[2:])

                if limit is not None and len(unique_data) >= limit:
                    break
Example #16
    def store_chunk(self,
                    source_config,
                    data,
                    limit=None,
                    db=DEFAULT_DB_ALIAS):
        """Store a large amount of data for a single source_config.

        Data MUST be a utf-8 encoded string (Just a str type).
        Take special care to make sure you aren't destroying data by mis-encoding it.

        Args:
            source_config (SourceConfig):
            data Generator[(str, str)]: (identifier, datum)

        Returns:
            Generator[RawDatum]
        """
        hashes = {}
        identifiers = {}
        now = timezone.now()

        if limit == 0:
            return []

        for chunk in chunked(data, 500):
            if not chunk:
                break

            new = []
            new_identifiers = set()
            for fr in chunk:
                if limit and len(hashes) >= limit:
                    break

                if fr.sha256 in hashes:
                    if hashes[fr.sha256] != fr.identifier:
                        raise ValueError(
                            '{!r} has already been seen or stored with identifier "{}". '
                            'Perhaps your identifier extraction is incorrect?'.
                            format(fr, hashes[fr.sha256]))
                logger.warning('Received duplicate datum %s from %s', fr,
                                   source_config)
                    continue

                new.append(fr)
                hashes[fr.sha256] = fr.identifier
                new_identifiers.add(fr.identifier)

            if new_identifiers:
                suids = SourceUniqueIdentifier.objects.raw(
                    '''
                    INSERT INTO "{table}"
                        ("{identifier}", "{source_config}")
                    VALUES
                        {values}
                    ON CONFLICT
                        ("{identifier}", "{source_config}")
                    DO UPDATE SET
                        id = "{table}".id
                    RETURNING {fields}
                '''.format(
                        table=SourceUniqueIdentifier._meta.db_table,
                        identifier=SourceUniqueIdentifier._meta.get_field(
                            'identifier').column,
                        source_config=SourceUniqueIdentifier._meta.get_field(
                            'source_config').column,
                        values=placeholders(
                            len(new_identifiers)
                        ),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                        fields=', '.join(
                            '"{}"'.format(field.column) for field in
                            SourceUniqueIdentifier._meta.concrete_fields),
                    ),
                    [(identifier, source_config.id)
                     for identifier in new_identifiers])

                for suid in suids:
                    identifiers[suid.identifier] = suid.pk

            if new:
                # Defer 'datum' by omitting it from the returned fields
                yield from RawDatum.objects.raw(
                    '''
                        INSERT INTO "{table}"
                            ("{suid}", "{hash}", "{datum}", "{datestamp}", "{date_modified}", "{date_created}")
                        VALUES
                            {values}
                        ON CONFLICT
                            ("{suid}", "{hash}")
                        DO UPDATE SET
                            "{datestamp}" = EXCLUDED."{datestamp}",
                            "{date_modified}" = EXCLUDED."{date_modified}"
                        RETURNING id, "{suid}", "{hash}", "{datestamp}", "{date_modified}", "{date_created}"
                    '''.format(
                        table=RawDatum._meta.db_table,
                        suid=RawDatum._meta.get_field('suid').column,
                        hash=RawDatum._meta.get_field('sha256').column,
                        datum=RawDatum._meta.get_field('datum').column,
                        datestamp=RawDatum._meta.get_field('datestamp').column,
                        date_modified=RawDatum._meta.get_field(
                            'date_modified').column,
                        date_created=RawDatum._meta.get_field(
                            'date_created').column,
                        values=', '.join(
                            '%s' for _ in range(len(new))
                        ),  # Nasty hack. Fix when psycopg2 2.7 is released with execute_values
                    ),
                    [(identifiers[fr.identifier], fr.sha256, fr.datum,
                      fr.datestamp, now, now) for fr in new])

            if limit and len(hashes) >= limit:
                break
Example #17
    def bulk_get_or_create(self,
                           objs,
                           defaults=None,
                           using='default',
                           update_fields=None,
                           defer_fields=None,
                           chunk_size=500,
                           ):
        if len(self.model._meta.unique_together) != 1:
            raise ValueError('Cannot determine the constraint to use for ON CONFLICT')

        def col(field_name):
            return self.model._meta.get_field(field_name).column

        columns = []
        field_names = []
        defaults = defaults or {}

        for field in self.model._meta.concrete_fields:
            if field is not self.model._meta.pk:
                columns.append(field.column)
                field_names.append(field.attname)
            if field in defaults:
                continue
            if field.default is not models.NOT_PROVIDED or field.null:
                defaults[field.attname] = field._get_default()
            elif isinstance(field, models.DateField) and (field.auto_now or field.auto_now_add):
                defaults[field.attname] = timezone.now()

        constraint = ', '.join(
            '"{}"'.format(col(f))
            for f in self.model._meta.unique_together[0]
        )

        if update_fields:
            update = [
                '"{0}" = EXCLUDED."{0}"'.format(col(f))
                for f in update_fields
            ]
        else:
            update = ['id = "{}".id'.format(self.model._meta.db_table)]

        returning = '*'
        if defer_fields:
            defer_columns = {col(f) for f in defer_fields}
            returning = ', '.join(['id'] + [c for c in columns if c not in defer_columns])

        loaded = []
        with transaction.atomic(using):
            for chunk in chunked(objs, chunk_size):
                if not chunk:
                    break
                loaded.extend(self.raw('''
                    INSERT INTO "{model._meta.db_table}"
                        ({columns})
                    VALUES
                        {values}
                    ON CONFLICT
                        ({constraint})
                    DO UPDATE SET
                        {update}
                    RETURNING
                        {returning}
                '''.format(
                    model=self.model,
                    columns=', '.join(columns),
                    constraint=constraint,
                    values=', '.join(['%s'] * len(chunk)),
                    update=', '.join(update),
                    returning=returning,
                ), [
                    tuple(getattr(obj, f, None) or defaults[f] for f in field_names)
                    for obj in chunk
                ]))
        return loaded
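A hypothetical call to the extended bulk_get_or_create above. The Tag model and its fields are placeholders; the only real requirements visible in the method are that the model declare exactly one unique_together constraint and that the passed objects have no primary keys yet.

# Hypothetical model and fields, purely to show the call shape.
tags = [Tag(name=name) for name in ('science', 'data', 'sharing')]

loaded = Tag.objects.bulk_get_or_create(
    tags,
    update_fields=['date_modified'],  # refreshed from EXCLUDED on conflict
    defer_fields=['description'],     # omitted from RETURNING, loaded lazily
)
# Every returned instance has a primary key, whether its row was freshly
# inserted or already existed.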