Example 1
    def add_row(self, user, data, external_id=None):
        """
        Add (or overwrite) a row to this dataset.
        """
        self.lock()

        try:
            data_typer = DataTyper(self.column_schema)

            solr_row = utils.solr.make_data_row(self, data, external_id=external_id)
            solr_row = data_typer(solr_row, data)

            solr.add(settings.SOLR_DATA_CORE, [solr_row], commit=True)

            self.schema = data_typer.schema

            if not self.sample_data:
                self.sample_data = []

            if len(self.sample_data) < 5:
                self.sample_data.append(data)

            old_row_count = self.row_count
            self.row_count = self._count_rows()
            added = self.row_count - (old_row_count or 0)
            self.last_modified = datetime.utcnow()
            self.last_modified_by = user
            self.last_modification = "1 row %s" % ("added" if added else "updated")
            self.save()

            return solr_row
        finally:
            self.unlock()
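
A minimal usage sketch for add_row (illustrative only: it assumes a configured PANDA/Django environment, an existing dataset, and that dataset users are Django auth users; the slug, username and column values are hypothetical):

    from django.contrib.auth.models import User  # assumption: PANDA users are Django auth users
    from panda.models import Dataset

    user = User.objects.get(username='admin@example.com')  # hypothetical account
    dataset = Dataset.objects.get(slug='contributors')      # hypothetical dataset

    # Values are given in column order; passing an external_id lets a later call
    # with the same id overwrite this row instead of adding a new one.
    solr_row = dataset.add_row(user, ['Jane', 'Doe', 'Example Corp'], external_id='1')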
Example 2
    def add_many_rows(self, user, data):
        """
        Shortcut for adding rows in bulk. 

        ``data`` must be an array of tuples in the format (data_array, external_id)
        """
        solr_rows = [utils.solr.make_data_row(self, d[0], external_id=d[1]) for d in data]

        solr.add(settings.SOLR_DATA_CORE, solr_rows, commit=True)

        if not self.sample_data:
            self.sample_data = []
        
        if len(self.sample_data) < 5:
            needed = 5 - len(self.sample_data)
            self.sample_data.extend([d[0] for d in data[:needed]])

        old_row_count = self.row_count
        self.row_count = self._count_rows()
        added = self.row_count - (old_row_count or 0)
        updated = len(data) - added
        self.last_modified = datetime.utcnow()
        self.last_modified_by = user

        if added and updated: 
            self.last_modification = '%i rows added and %i updated' % (added, updated)
        elif added:
            self.last_modification = '%i rows added' % added
        else:
            self.last_modification = '%i rows updated' % updated

        self.save()

        return solr_rows
Example 3
    def add_many_rows(self, user, data):
        """
        Shortcut for adding rows in bulk. 

        ``data`` must be an array of tuples in the format (data_array, external_id)
        """
        self.lock()

        try:
            data_typer = DataTyper(self.column_schema)

            solr_rows = [
                utils.solr.make_data_row(self, d[0], external_id=d[1])
                for d in data
            ]
            solr_rows = [data_typer(s, d[0]) for s, d in zip(solr_rows, data)]

            solr.add(settings.SOLR_DATA_CORE, solr_rows, commit=True)

            self.schema = data_typer.schema

            if not self.sample_data:
                self.sample_data = []

            if len(self.sample_data) < 5:
                needed = 5 - len(self.sample_data)
                self.sample_data.extend([d[0] for d in data[:needed]])

            old_row_count = self.row_count
            self.row_count = self._count_rows()
            added = self.row_count - (old_row_count or 0)
            updated = len(data) - added
            self.last_modified = now()
            self.last_modified_by = user

            if added and updated:
                self.last_modification = _('%(added)i rows added and %(updated)i updated') \
                    % {'added': added, 'updated': updated}
            elif added:
                self.last_modification = _('%i rows added') % added
            else:
                self.last_modification = _('%i rows updated') % updated

            self.save()

            return solr_rows
        finally:
            self.unlock()
Example 4
    def add_many_rows(self, user, data):
        """
        Shortcut for adding rows in bulk. 

        ``data`` must be an array of tuples in the format (data_array, external_id)
        """
        self.lock()

        try:
            data_typer = DataTyper(self.column_schema)

            solr_rows = [utils.solr.make_data_row(self, d[0], external_id=d[1]) for d in data]
            solr_rows = [data_typer(s, d[0]) for s, d in zip(solr_rows, data)]

            solr.add(settings.SOLR_DATA_CORE, solr_rows, commit=True)
            
            self.schema = data_typer.schema

            if not self.sample_data:
                self.sample_data = []
            
            if len(self.sample_data) < 5:
                needed = 5 - len(self.sample_data)
                self.sample_data.extend([d[0] for d in data[:needed]])

            old_row_count = self.row_count
            self.row_count = self._count_rows()
            added = self.row_count - (old_row_count or 0)
            updated = len(data) - added
            self.last_modified = now()
            self.last_modified_by = user

            if added and updated: 
                self.last_modification = _('%(added)i rows added and %(updated)i updated') \
                    % {'added': added, 'updated': updated}
            elif added:
                self.last_modification = _('%i rows added') % added
            else:
                self.last_modification = _('%i rows updated') % updated

            self.save()

            return solr_rows
        finally:
            self.unlock()
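
A companion sketch for add_many_rows, continuing the hypothetical dataset and user from the add_row sketch above. data is a sequence of (data_array, external_id) tuples, and the method reports how many rows were added versus updated by comparing the Solr row count before and after the write.

    # Illustrative bulk insert; external ids keep re-imports from duplicating rows.
    rows = [
        (['Jane', 'Doe', 'Example Corp'], '1'),
        (['John', 'Roe', 'Example Corp'], '2'),
    ]

    solr_rows = dataset.add_many_rows(user, rows)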
Example 5
    def update_full_text(self, commit=True):
        """
        Update the full-text search metadata for this dataset stored in Solr.
        """
        category_ids = []

        full_text_data = [
            self.name,
            self.description,
            "%s %s" % (self.creator.first_name, self.creator.last_name),
            self.creator.email,
        ]

        for category in self.categories.all():
            category_ids.append(category.id)
            full_text_data.append(category.name)

        if not category_ids:
            category_ids.append(settings.PANDA_UNCATEGORIZED_ID)
            full_text_data.append(settings.PANDA_UNCATEGORIZED_NAME)

        for data_upload in self.data_uploads.all():
            full_text_data.append(data_upload.original_filename)

        for related_upload in self.related_uploads.all():
            full_text_data.append(related_upload.original_filename)

        if self.column_schema is not None:
            full_text_data.extend([c["name"] for c in self.column_schema])

        full_text = "\n".join(full_text_data)

        solr.add(
            settings.SOLR_DATASETS_CORE,
            [
                {
                    "slug": self.slug,
                    "creation_date": self.creation_date.isoformat() + "Z",
                    "categories": category_ids,
                    "full_text": full_text,
                }
            ],
            commit=commit,
        )
Example 6
    def update_full_text(self, commit=True):
        """
        Update the full-text search metadata for this dataset stored in Solr.
        """
        category_ids = []

        full_text_data = [
            unquote(self.name),
            unquote(self.description),
            '%s %s' % (self.creator.first_name, self.creator.last_name),
            self.creator.email
        ]

        for category in self.categories.all():
            category_ids.append(category.id)
            full_text_data.append(category.name)

        if not category_ids:
            category_ids.append(settings.PANDA_UNCATEGORIZED_ID)
            full_text_data.append(settings.PANDA_UNCATEGORIZED_NAME)

        for data_upload in self.data_uploads.all():
            full_text_data.append(data_upload.original_filename)

        for related_upload in self.related_uploads.all():
            full_text_data.append(related_upload.original_filename)

        if self.column_schema is not None:
            full_text_data.extend([c['name'] for c in self.column_schema])

        full_text = u'\n'.join(map(
            unicode, full_text_data))  # convert any i18n proxies into strings

        solr.add(settings.SOLR_DATASETS_CORE,
                 [{
                     'slug': self.slug,
                     'creation_date': self.creation_date.isoformat() + 'Z',
                     'categories': category_ids,
                     'full_text': full_text
                 }],
                 commit=commit)
Example 7
    def update_full_text(self, commit=True):
        """
        Update the full-text search metadata for this dataset stored in Solr.
        """
        category_ids = []

        full_text_data = [
            unquote(self.name),
            unquote(self.description),
            '%s %s' % (self.creator.first_name, self.creator.last_name),
            self.creator.email
        ]

        for category in self.categories.all():
            category_ids.append(category.id)
            full_text_data.append(category.name)

        if not category_ids:
            category_ids.append(settings.PANDA_UNCATEGORIZED_ID)
            full_text_data.append(settings.PANDA_UNCATEGORIZED_NAME)

        for data_upload in self.data_uploads.all():
            full_text_data.append(data_upload.original_filename)

        for related_upload in self.related_uploads.all():
            full_text_data.append(related_upload.original_filename)

        if self.column_schema is not None:
            full_text_data.extend([c['name'] for c in self.column_schema])

        full_text = '\n'.join(full_text_data)

        solr.add(settings.SOLR_DATASETS_CORE, [{
            'slug': self.slug,
            'creation_date': self.creation_date.isoformat() + 'Z',
            'categories': category_ids,
            'full_text': full_text
        }], commit=commit)
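
update_full_text accepts commit=False, which is useful when touching many datasets at once: each call adds its metadata document to the datasets core, and a single commit at the end makes them all searchable together. A sketch of that pattern (the settings names come from the snippets; the import paths are assumptions):

    from django.conf import settings
    from panda import solr            # assumption: the Solr helper module used in the snippets
    from panda.models import Dataset

    for dataset in Dataset.objects.all():
        dataset.update_full_text(commit=False)  # buffer each dataset's document

    solr.commit(settings.SOLR_DATASETS_CORE)    # one commit for the whole batch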
Example 8
    def add_row(self, user, data, external_id=None):
        """
        Add (or overwrite) a row to this dataset.
        """
        solr_row = utils.solr.make_data_row(self, data, external_id=external_id)

        solr.add(settings.SOLR_DATA_CORE, [solr_row], commit=True)

        if not self.sample_data:
            self.sample_data = []
        
        if len(self.sample_data) < 5:
            self.sample_data.append(data)

        old_row_count = self.row_count
        self.row_count = self._count_rows()
        added = self.row_count - (old_row_count or 0)
        self.last_modified = datetime.utcnow()
        self.last_modified_by = user
        self.last_modification = '1 row %s' % ('added' if added else 'updated')
        self.save()

        return solr_row
Example 9
    def add_row(self, user, data, external_id=None):
        """
        Add (or overwrite) a row to this dataset.
        """
        self.lock()

        try:
            data_typer = DataTyper(self.column_schema)

            solr_row = utils.solr.make_data_row(self,
                                                data,
                                                external_id=external_id)
            solr_row = data_typer(solr_row, data)

            solr.add(settings.SOLR_DATA_CORE, [solr_row], commit=True)

            self.schema = data_typer.schema

            if not self.sample_data:
                self.sample_data = []

            if len(self.sample_data) < 5:
                self.sample_data.append(data)

            old_row_count = self.row_count
            self.row_count = self._count_rows()
            added = self.row_count - (old_row_count or 0)
            self.last_modified = now()
            self.last_modified_by = user
            self.last_modification = _('1 row %s') % ('added'
                                                      if added else 'updated')
            self.save()

            return solr_row
        finally:
            self.unlock()
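
Every locking variant of these methods wraps its body in try/finally around lock() and unlock(). If that pairing were needed elsewhere, a small context manager could express it once; this is a sketch of the idea, not part of PANDA itself:

    from contextlib import contextmanager

    @contextmanager
    def locked(dataset):
        """Hold the dataset's lock for the duration of a with-block (hypothetical helper)."""
        dataset.lock()

        try:
            yield dataset
        finally:
            dataset.unlock()

    # Usage sketch -- the block runs with the lock held and always releases it:
    # with locked(dataset):
    #     ...modify the dataset...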
Example 10
    def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
        """
        Execute import.
        """
        from panda.models import Dataset, DataUpload
        
        log = logging.getLogger(self.name)
        log.info('Beginning import, dataset_slug: %s' % dataset_slug)
    
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Import failed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

            return

        upload = DataUpload.objects.get(id=upload_id)

        task_status = dataset.current_task
        task_status.begin(ugettext('Preparing to import'))

        book = xlrd.open_workbook(upload.get_path(), on_demand=True)
        sheet = book.sheet_by_index(0)
        row_count = sheet.nrows
        
        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value('PERF', 'TASK_THROTTLE')

        for i in range(1, row_count):
            values = sheet.row_values(i)
            types = sheet.row_types(i)

            normal_values = []

            for v, t in zip(values, types):
                if t == xlrd.biffh.XL_CELL_DATE:
                    v = utils.xls.normalize_date(v, book.datemode)
                elif t == xlrd.biffh.XL_CELL_NUMBER:
                    if v % 1 == 0:
                        v = int(v)

                normal_values.append(unicode(v))

            external_id = None

            if external_id_field_index is not None:
                external_id = values[external_id_field_index]

            data = utils.solr.make_data_row(dataset, normal_values, data_upload=upload, external_id=external_id)
            data = data_typer(data, normal_values)

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)
                add_buffer = []

                task_status.update(ugettext('%.0f%% complete') % floor(float(i) / float(row_count) * 100))

                if self.is_aborted():
                    task_status.abort(ugettext('Aborted after importing %.0f%%') % floor(float(i) / float(row_count) * 100))

                    log.warning('Import aborted, dataset_slug: %s' % dataset_slug)

                    return
            
                time.sleep(throttle)

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        task_status.update(ugettext('100% complete'))

        # Refresh the dataset from the database so there is no chance of clobbering changes made since the task started
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Import could not be completed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

            return

        if not dataset.row_count:
            dataset.row_count = i
        else:
            dataset.row_count += i

        dataset.column_schema = data_typer.schema

        dataset.save()

        # Refresh the upload from the database
        upload = DataUpload.objects.get(id=upload_id)

        upload.imported = True
        upload.save()

        log.info('Finished import, dataset_slug: %s' % dataset_slug)
        
        return data_typer
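
utils.xls.normalize_date is a PANDA helper; a rough equivalent built directly on xlrd's API looks like the sketch below (an illustration, not necessarily the project's actual implementation):

    import datetime

    import xlrd

    def normalize_xl_date(value, datemode):
        """Convert an xlrd date cell (a float) to an ISO 8601 string."""
        # xldate_as_tuple returns (year, month, day, hour, minute, second).
        parts = xlrd.xldate_as_tuple(value, datemode)

        if parts[:3] == (0, 0, 0):
            # A time-only cell has no date component.
            return datetime.time(*parts[3:]).isoformat()

        return datetime.datetime(*parts).isoformat()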
Example 11
    def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
        """
        Execute import.
        """
        from panda.models import Dataset, DataUpload
        
        log = logging.getLogger(self.name)
        log.info('Beginning import, dataset_slug: %s' % dataset_slug)

        dataset = Dataset.objects.get(slug=dataset_slug)
        upload = DataUpload.objects.get(id=upload_id)

        task_status = dataset.current_task
        task_status.begin('Preparing to import')

        line_count = self._count_lines(upload.get_path())

        if self.is_aborted():
            task_status.abort('Aborted during preparation')

            log.warning('Import aborted, dataset_slug: %s' % dataset_slug)

            return

        f = open(upload.get_path(), 'r')

        reader = CSVKitReader(f, encoding=upload.encoding, **upload.dialect_as_parameters())
        reader.next()

        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value('PERF', 'TASK_THROTTLE')

        i = 0

        while True:
            # The row number which is about to be read, for error handling and indexing
            i += 1

            try:
                row = reader.next()
            except StopIteration:
                i -= 1
                break
            except UnicodeDecodeError:
                raise DataImportError('This CSV file contains characters that are not %s encoded in or after row %i. You need to re-upload this file and input the correct encoding in order to import data from this file.' % (upload.encoding, i))

            external_id = None

            if external_id_field_index is not None:
                external_id = row[external_id_field_index]

            data = utils.solr.make_data_row(dataset, row, external_id=external_id)
            data = data_typer(data, row)

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)

                add_buffer = []

                task_status.update('%.0f%% complete (estimated)' % floor(float(i) / float(line_count) * 100))

                if self.is_aborted():
                    task_status.abort('Aborted after importing %.0f%% (estimated)' % floor(float(i) / float(line_count) * 100))

                    log.warning('Import aborted, dataset_slug: %s' % dataset_slug)

                    return

                time.sleep(throttle)

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        f.close()

        task_status.update('100% complete')

        # Refresh the dataset from the database so there is no chance of clobbering changes made since the task started
        dataset = Dataset.objects.get(slug=dataset_slug)

        if not dataset.row_count:
            dataset.row_count = i
        else:
            dataset.row_count += i

        dataset.column_schema = data_typer.schema

        dataset.save()

        # Refresh the upload from the database
        upload = DataUpload.objects.get(id=upload_id)

        upload.imported = True
        upload.save()

        log.info('Finished import, dataset_slug: %s' % dataset_slug)

        return data_typer
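
The loop above shows the buffering strategy shared by all of these import tasks: documents are sent to Solr in batches of SOLR_ADD_BUFFER_SIZE without committing, any remainder is flushed after the loop, and a single commit makes the whole import visible at once. Reduced to its essentials (the row source and document builder are placeholders, not PANDA code):

    add_buffer = []

    for i, row in enumerate(rows, start=1):        # `rows` is any iterable of parsed rows
        add_buffer.append(build_document(row))     # placeholder for make_data_row + DataTyper

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)  # batched add, no commit yet
            add_buffer = []

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)      # flush the remainder

    solr.commit(settings.SOLR_DATA_CORE)                   # one commit for the whole import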
Example 12
    def run(self, dataset_slug, *args, **kwargs):
        """
        Execute reindex.
        """
        from panda.models import Dataset
        
        log = logging.getLogger(self.name)
        log.info('Beginning reindex, dataset_slug: %s' % dataset_slug)

        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Reindexing failed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

            return

        task_status = dataset.current_task
        task_status.begin(ugettext('Preparing to reindex'))

        if self.is_aborted():
            task_status.abort(ugettext('Aborted during preparation'))

            log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)

            return

        read_buffer = []
        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value('PERF', 'TASK_THROTTLE')

        i = 0

        while i < dataset.row_count:
            if not read_buffer:
                query = 'dataset_slug: %s' % (dataset.slug)
                response = solr.query(settings.SOLR_DATA_CORE, query, limit=SOLR_READ_BUFFER_SIZE, offset=i)
                read_buffer = response['response']['docs']

            data = read_buffer.pop(0)
            row = json.loads(data['data'])

            new_data = utils.solr.make_data_row(dataset, row)
            new_data['id'] = data['id'] 
            new_data['data_upload_id'] = data['data_upload_id']
            new_data = data_typer(new_data, row)

            add_buffer.append(new_data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)

                add_buffer = []

                task_status.update(ugettext('%.0f%% complete') % floor(float(i) / float(dataset.row_count) * 100))

                if self.is_aborted():
                    task_status.abort(ugettext('Aborted after reindexing %.0f%%') % floor(float(i) / float(dataset.row_count) * 100))

                    log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)

                    return
            
                time.sleep(throttle)

            i += 1

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        task_status.update(ugettext('100% complete'))

        # Refresh dataset
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Reindexing could not be completed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

            return

        dataset.column_schema = data_typer.schema 
        dataset.save()

        log.info('Finished reindex, dataset_slug: %s' % dataset_slug)

        return data_typer
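
The reindex task pages through the data core instead of loading every row at once: it reads SOLR_READ_BUFFER_SIZE documents per query, keyed on the dataset slug, and refills the buffer whenever it runs dry. The paging logic in isolation (a sketch reusing the helper names from the snippet; dataset is assumed to be a loaded Dataset):

    read_buffer = []
    i = 0

    while i < dataset.row_count:
        if not read_buffer:
            response = solr.query(settings.SOLR_DATA_CORE,
                                  'dataset_slug: %s' % dataset.slug,
                                  limit=SOLR_READ_BUFFER_SIZE,
                                  offset=i)
            read_buffer = response['response']['docs']

        doc = read_buffer.pop(0)
        # ...retype `doc` and append it to an add buffer, as in the example above...
        i += 1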
Example 13
    def run(self,
            dataset_slug,
            upload_id,
            external_id_field_index=None,
            *args,
            **kwargs):
        """
        Execute import.
        """
        from panda.models import Dataset, DataUpload

        log = logging.getLogger(self.name)
        log.info('Beginning import, dataset_slug: %s' % dataset_slug)

        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning(
                'Import failed due to Dataset being deleted, dataset_slug: %s'
                % dataset_slug)

            return

        upload = DataUpload.objects.get(id=upload_id)

        task_status = dataset.current_task
        task_status.begin('Preparing to import')

        line_count = self._count_lines(upload.get_path())

        if self.is_aborted():
            task_status.abort('Aborted during preparation')

            log.warning('Import aborted, dataset_slug: %s' % dataset_slug)

            return

        f = open(upload.get_path(), 'r')

        reader = CSVKitReader(f,
                              encoding=upload.encoding,
                              **upload.dialect_as_parameters())
        reader.next()

        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value('PERF', 'TASK_THROTTLE')

        i = 0

        while True:
            # The row number which is about to be read, for error handling and indexing
            i += 1

            try:
                row = reader.next()
            except StopIteration:
                i -= 1
                break
            except UnicodeDecodeError:
                raise DataImportError(
                    'This CSV file contains characters that are not %s encoded in or after row %i. You need to re-upload this file and input the correct encoding in order to import data from this file.'
                    % (upload.encoding, i))

            external_id = None

            if external_id_field_index is not None:
                external_id = row[external_id_field_index]

            data = utils.solr.make_data_row(dataset,
                                            row,
                                            data_upload=upload,
                                            external_id=external_id)
            data = data_typer(data, row)

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)

                add_buffer = []

                task_status.update('%.0f%% complete (estimated)' %
                                   floor(float(i) / float(line_count) * 100))

                if self.is_aborted():
                    task_status.abort(
                        'Aborted after importing %.0f%% (estimated)' %
                        floor(float(i) / float(line_count) * 100))

                    log.warning('Import aborted, dataset_slug: %s' %
                                dataset_slug)

                    return

                time.sleep(throttle)

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        f.close()

        task_status.update('100% complete')

        # Refresh the dataset from the database so there is no chance of clobbering changes made since the task started
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning(
                'Import could not be completed due to Dataset being deleted, dataset_slug: %s'
                % dataset_slug)

            return

        if not dataset.row_count:
            dataset.row_count = i
        else:
            dataset.row_count += i

        dataset.column_schema = data_typer.schema

        dataset.save()

        # Refresh the upload from the database
        upload = DataUpload.objects.get(id=upload_id)

        upload.imported = True
        upload.save()

        log.info('Finished import, dataset_slug: %s' % dataset_slug)

        return data_typer
Example 14
    def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
        """
        Execute import.
        """
        from panda.models import Dataset, DataUpload
        
        log = logging.getLogger(self.name)
        log.info('Beginning import, dataset_slug: %s' % dataset_slug)

        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Import failed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

            return

        upload = DataUpload.objects.get(id=upload_id)

        task_status = dataset.current_task
        task_status.begin('Preparing to import')

        book = load_workbook(upload.get_path(), use_iterators=True)
        sheet = book.get_active_sheet()
        row_count = sheet.get_highest_row()
        
        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value('PERF', 'TASK_THROTTLE')

        for i, row in enumerate(sheet.iter_rows()):
            # Skip header
            if i == 0:
                continue

            values = []

            for c in row:
                value = c.internal_value

                if value.__class__ is datetime.datetime:
                    value = utils.xlsx.normalize_date(value)
                elif value.__class__ is float:
                    if value % 1 == 0:
                        value = int(value)

                if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                    value = value.isoformat()

                values.append(value)

            external_id = None

            if external_id_field_index is not None:
                external_id = values[external_id_field_index]

            data = utils.solr.make_data_row(dataset, values, data_upload=upload, external_id=external_id)
            data = data_typer(data, values)

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)
                add_buffer = []

                task_status.update('%.0f%% complete' % floor(float(i) / float(row_count) * 100))

                if self.is_aborted():
                    task_status.abort('Aborted after importing %.0f%%' % floor(float(i) / float(row_count) * 100))

                    log.warning('Import aborted, dataset_slug: %s' % dataset_slug)

                    return
                
                time.sleep(throttle)

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        task_status.update('100% complete')

        # Refresh the dataset from the database so there is no chance of clobbering changes made since the task started
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Import could not be completed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

            return

        if not dataset.row_count:
            dataset.row_count = i
        else:
            dataset.row_count += i
        
        dataset.column_schema = data_typer.schema

        dataset.save()

        # Refresh the upload from the database
        upload = DataUpload.objects.get(id=upload_id)

        upload.imported = True
        upload.save()

        log.info('Finished import, dataset_slug: %s' % dataset_slug)

        return data_typer
Example 15
    def run(self, dataset_slug, *args, **kwargs):
        """
        Execute reindex.
        """
        from panda.models import Dataset
        
        log = logging.getLogger(self.name)
        log.info('Beginning reindex, dataset_slug: %s' % dataset_slug)

        dataset = Dataset.objects.get(slug=dataset_slug)

        task_status = dataset.current_task
        task_status.begin('Preparing to reindex')

        if self.is_aborted():
            task_status.abort('Aborted during preparation')

            log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)

            return

        read_buffer = []
        add_buffer = []

        i = 0

        while i < dataset.row_count:
            if not read_buffer:
                query = 'dataset_slug: %s' % (dataset.slug)
                response = solr.query(settings.SOLR_DATA_CORE, query, limit=SOLR_READ_BUFFER_SIZE, offset=i, sort='id asc')
                read_buffer = response['response']['docs']

            data = read_buffer.pop(0)
            row = json.loads(data['data'])

            new_data = utils.solr.make_data_row(dataset, row)
            new_data['id'] = data['id'] 

            # Generate typed column data
            for n, c in enumerate(dataset.column_schema):
                if c['indexed']:
                    try:
                        value = coerce_type(row[n], TYPE_NAMES_MAPPING[c['type']])
                        new_data[c['indexed_name']] = value
                    except TypeCoercionError, e:
                        # TODO: log here
                        pass

            add_buffer.append(new_data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)

                add_buffer = []

                task_status.update('%.0f%% complete' % floor(float(i) / float(dataset.row_count) * 100))

                if self.is_aborted():
                    task_status.abort('Aborted after reindexing %.0f%%' % floor(float(i) / float(dataset.row_count) * 100))

                    log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)

                    return

            i += 1
Example 16
    def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
        """
        Execute import.
        """
        from panda.models import Dataset, DataUpload
        
        log = logging.getLogger(self.name)
        log.info('Beginning import, dataset_slug: %s' % dataset_slug)

        dataset = Dataset.objects.get(slug=dataset_slug)
        upload = DataUpload.objects.get(id=upload_id)

        task_status = dataset.current_task
        task_status.begin('Preparing to import')

        book = load_workbook(upload.get_path(), use_iterators=True)
        sheet = book.get_active_sheet()
        row_count = sheet.get_highest_row()
        
        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value('PERF', 'TASK_THROTTLE')

        for i, row in enumerate(sheet.iter_rows()):
            # Skip header
            if i == 0:
                continue

            values = []

            for c in row:
                value = c.internal_value

                if value.__class__ is datetime.datetime:
                    value = utils.xlsx.normalize_date(value)
                elif value.__class__ is float:
                    if value % 1 == 0:
                        value = int(value)

                if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                    value = value.isoformat()

                values.append(value)

            external_id = None

            if external_id_field_index is not None:
                external_id = values[external_id_field_index]

            data = utils.solr.make_data_row(dataset, values, external_id=external_id)
            data = data_typer(data, values)

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)
                add_buffer = []

                task_status.update('%.0f%% complete' % floor(float(i) / float(row_count) * 100))

                if self.is_aborted():
                    task_status.abort('Aborted after importing %.0f%%' % floor(float(i) / float(row_count) * 100))

                    log.warning('Import aborted, dataset_slug: %s' % dataset_slug)

                    return
                
                time.sleep(throttle)

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        task_status.update('100% complete')

        # Refresh the dataset from the database so there is no chance of clobbering changes made since the task started
        dataset = Dataset.objects.get(slug=dataset_slug)

        if not dataset.row_count:
            dataset.row_count = i
        else:
            dataset.row_count += i
        
        dataset.column_schema = data_typer.schema

        dataset.save()

        # Refresh the upload from the database
        upload = DataUpload.objects.get(id=upload_id)

        upload.imported = True
        upload.save()

        log.info('Finished import, dataset_slug: %s' % dataset_slug)

        return data_typer
Example 17
    def run(self, dataset_slug, *args, **kwargs):
        """
        Execute reindex.
        """
        from panda.models import Dataset

        log = logging.getLogger(self.name)
        log.info('Beginning reindex, dataset_slug: %s' % dataset_slug)

        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning(
                'Reindexing failed due to Dataset being deleted, dataset_slug: %s'
                % dataset_slug)

            return

        task_status = dataset.current_task
        task_status.begin(ugettext('Preparing to reindex'))

        if self.is_aborted():
            task_status.abort(ugettext('Aborted during preparation'))

            log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)

            return

        read_buffer = []
        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value('PERF', 'TASK_THROTTLE')

        i = 0

        while i < dataset.row_count:
            if not read_buffer:
                query = 'dataset_slug: %s' % (dataset.slug)
                response = solr.query(settings.SOLR_DATA_CORE,
                                      query,
                                      limit=SOLR_READ_BUFFER_SIZE,
                                      offset=i)
                read_buffer = response['response']['docs']

            data = read_buffer.pop(0)
            row = json.loads(data['data'])

            new_data = utils.solr.make_data_row(dataset, row)
            new_data['id'] = data['id']
            new_data['data_upload_id'] = data['data_upload_id']
            new_data = data_typer(new_data, row)

            add_buffer.append(new_data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)

                add_buffer = []

                task_status.update(
                    ugettext('%.0f%% complete') %
                    floor(float(i) / float(dataset.row_count) * 100))

                if self.is_aborted():
                    task_status.abort(
                        ugettext('Aborted after reindexing %.0f%%') %
                        floor(float(i) / float(dataset.row_count) * 100))

                    log.warning('Reindex aborted, dataset_slug: %s' %
                                dataset_slug)

                    return

                time.sleep(throttle)

            i += 1

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        task_status.update(ugettext('100% complete'))

        # Refresh dataset
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning(
                'Reindexing could not be completed due to Dataset being deleted, dataset_slug: %s'
                % dataset_slug)

            return

        dataset.column_schema = data_typer.schema
        dataset.save()

        log.info('Finished reindex, dataset_slug: %s' % dataset_slug)

        return data_typer
Example 18
    def run(self,
            dataset_slug,
            upload_id,
            external_id_field_index=None,
            *args,
            **kwargs):
        """
        Execute import.
        """
        from panda.models import Dataset, DataUpload

        log = logging.getLogger(self.name)
        log.info('Beginning import, dataset_slug: %s' % dataset_slug)

        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning(
                'Import failed due to Dataset being deleted, dataset_slug: %s'
                % dataset_slug)

            return

        upload = DataUpload.objects.get(id=upload_id)

        task_status = dataset.current_task
        task_status.begin(ugettext('Preparing to import'))

        book = xlrd.open_workbook(upload.get_path(), on_demand=True)
        sheet = book.sheet_by_index(0)
        row_count = sheet.nrows

        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value('PERF', 'TASK_THROTTLE')

        for i in range(1, row_count):
            values = sheet.row_values(i)
            types = sheet.row_types(i)

            normal_values = []

            for v, t in zip(values, types):
                if t == xlrd.biffh.XL_CELL_DATE:
                    v = utils.xls.normalize_date(v, book.datemode)
                elif t == xlrd.biffh.XL_CELL_NUMBER:
                    if v % 1 == 0:
                        v = int(v)

                normal_values.append(unicode(v))

            external_id = None

            if external_id_field_index is not None:
                external_id = values[external_id_field_index]

            data = utils.solr.make_data_row(dataset,
                                            normal_values,
                                            data_upload=upload,
                                            external_id=external_id)
            data = data_typer(data, normal_values)

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)
                add_buffer = []

                task_status.update(
                    ugettext('%.0f%% complete') %
                    floor(float(i) / float(row_count) * 100))

                if self.is_aborted():
                    task_status.abort(
                        ugettext('Aborted after importing %.0f%%') %
                        floor(float(i) / float(row_count) * 100))

                    log.warning('Import aborted, dataset_slug: %s' %
                                dataset_slug)

                    return

                time.sleep(throttle)

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        task_status.update(ugettext('100% complete'))

        # Refresh the dataset from the database so there is no chance of clobbering changes made since the task started
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning(
                'Import could not be completed due to Dataset being deleted, dataset_slug: %s'
                % dataset_slug)

            return

        if not dataset.row_count:
            dataset.row_count = i
        else:
            dataset.row_count += i

        dataset.column_schema = data_typer.schema

        dataset.save()

        # Refresh the upload from the database
        upload = DataUpload.objects.get(id=upload_id)

        upload.imported = True
        upload.save()

        log.info('Finished import, dataset_slug: %s' % dataset_slug)

        return data_typer
Example 19
    def run(self, dataset_slug, *args, **kwargs):
        """
        Execute reindex.
        """
        from panda.models import Dataset

        log = logging.getLogger(self.name)
        log.info("Beginning reindex, dataset_slug: %s" % dataset_slug)

        dataset = Dataset.objects.get(slug=dataset_slug)

        task_status = dataset.current_task
        task_status.begin("Preparing to reindex")

        if self.is_aborted():
            task_status.abort("Aborted during preparation")

            log.warning("Reindex aborted, dataset_slug: %s" % dataset_slug)

            return

        read_buffer = []
        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value("PERF", "TASK_THROTTLE")

        i = 0

        while i < dataset.row_count:
            if not read_buffer:
                query = "dataset_slug: %s" % (dataset.slug)
                response = solr.query(settings.SOLR_DATA_CORE, query, limit=SOLR_READ_BUFFER_SIZE, offset=i)
                read_buffer = response["response"]["docs"]

            data = read_buffer.pop(0)
            row = json.loads(data["data"])

            new_data = utils.solr.make_data_row(dataset, row)
            new_data["id"] = data["id"]
            new_data["data_upload_id"] = data["data_upload_id"]
            new_data = data_typer(new_data, row)

            add_buffer.append(new_data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)

                add_buffer = []

                task_status.update("%.0f%% complete" % floor(float(i) / float(dataset.row_count) * 100))

                if self.is_aborted():
                    task_status.abort(
                        "Aborted after reindexing %.0f%%" % floor(float(i) / float(dataset.row_count) * 100)
                    )

                    log.warning("Reindex aborted, dataset_slug: %s" % dataset_slug)

                    return

                time.sleep(throttle)

            i += 1

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        task_status.update("100% complete")

        # Refresh dataset
        dataset = Dataset.objects.get(slug=dataset_slug)
        dataset.column_schema = data_typer.schema
        dataset.save()

        log.info("Finished reindex, dataset_slug: %s" % dataset_slug)

        return data_typer
Example 20
    def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
        """
        Execute import.
        """
        from panda.models import Dataset, DataUpload
        
        log = logging.getLogger(self.name)
        log.info('Beginning import, dataset_slug: %s' % dataset_slug)

        dataset = Dataset.objects.get(slug=dataset_slug)
        upload = DataUpload.objects.get(id=upload_id)

        task_status = dataset.current_task
        self.task_start(task_status, 'Preparing to import')

        book = xlrd.open_workbook(upload.get_path(), on_demand=True)
        sheet = book.sheet_by_index(0)
        row_count = sheet.nrows
        
        add_buffer = []

        for i in range(1, row_count):
            values = sheet.row_values(i)
            types = sheet.row_types(i)

            normal_values = []

            for v, t in zip(values, types):
                if t == xlrd.biffh.XL_CELL_DATE:
                    v = utils.xls.normalize_date(v, book.datemode)
                elif t == xlrd.biffh.XL_CELL_NUMBER:
                    if v % 1 == 0:
                        v = int(v)

                normal_values.append(unicode(v))

            external_id = None

            if external_id_field_index is not None:
                external_id = values[external_id_field_index]

            data = utils.solr.make_data_row(dataset, values, external_id=external_id)

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)
                add_buffer = []

                task_status.message = '%.0f%% complete' % floor(float(i) / float(row_count) * 100)
                task_status.save()

                if self.is_aborted():
                    self.task_abort(task_status, 'Aborted after importing %.0f%%' % floor(float(i) / float(row_count) * 100))

                    log.warning('Import aborted, dataset_slug: %s' % dataset_slug)

                    return

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        self.task_update(task_status, '100% complete')

        # Refresh the dataset from the database so there is no chance of clobbering changes made since the task started
        dataset = Dataset.objects.get(slug=dataset_slug)

        if not dataset.row_count:
            dataset.row_count = i
        else:
            dataset.row_count += i

        dataset.save()

        # Refresh the upload from the database
        upload = DataUpload.objects.get(id=upload_id)

        upload.imported = True
        upload.save()

        log.info('Finished import, dataset_slug: %s' % dataset_slug)