def add_row(self, user, data, external_id=None):
    """
    Add (or overwrite) a row to this dataset.
    """
    self.lock()

    try:
        data_typer = DataTyper(self.column_schema)

        solr_row = utils.solr.make_data_row(self, data, external_id=external_id)
        solr_row = data_typer(solr_row, data)

        solr.add(settings.SOLR_DATA_CORE, [solr_row], commit=True)

        self.schema = data_typer.schema

        if not self.sample_data:
            self.sample_data = []

        if len(self.sample_data) < 5:
            self.sample_data.append(data)

        old_row_count = self.row_count
        self.row_count = self._count_rows()
        added = self.row_count - (old_row_count or 0)

        self.last_modified = datetime.utcnow()
        self.last_modified_by = user
        self.last_modification = "1 row %s" % ("added" if added else "updated")

        self.save()

        return solr_row
    finally:
        self.unlock()

def add_many_rows(self, user, data):
    """
    Shortcut for adding rows in bulk.

    ``data`` must be an array of tuples in the format (data_array, external_id)
    """
    solr_rows = [utils.solr.make_data_row(self, d[0], external_id=d[1]) for d in data]

    solr.add(settings.SOLR_DATA_CORE, solr_rows, commit=True)

    if not self.sample_data:
        self.sample_data = []

    if len(self.sample_data) < 5:
        needed = 5 - len(self.sample_data)
        self.sample_data.extend([d[0] for d in data[:needed]])

    old_row_count = self.row_count
    self.row_count = self._count_rows()
    added = self.row_count - (old_row_count or 0)
    updated = len(data) - added

    self.last_modified = datetime.utcnow()
    self.last_modified_by = user

    if added and updated:
        self.last_modification = '%i rows added and %i updated' % (added, updated)
    elif added:
        self.last_modification = '%i rows added' % added
    else:
        self.last_modification = '%i rows updated' % updated

    self.save()

    return solr_rows

def add_many_rows(self, user, data):
    """
    Shortcut for adding rows in bulk.

    ``data`` must be an array of tuples in the format (data_array, external_id)
    """
    self.lock()

    try:
        data_typer = DataTyper(self.column_schema)

        solr_rows = [utils.solr.make_data_row(self, d[0], external_id=d[1]) for d in data]
        solr_rows = [data_typer(s, d[0]) for s, d in zip(solr_rows, data)]

        solr.add(settings.SOLR_DATA_CORE, solr_rows, commit=True)

        self.schema = data_typer.schema

        if not self.sample_data:
            self.sample_data = []

        if len(self.sample_data) < 5:
            needed = 5 - len(self.sample_data)
            self.sample_data.extend([d[0] for d in data[:needed]])

        old_row_count = self.row_count
        self.row_count = self._count_rows()
        added = self.row_count - (old_row_count or 0)
        updated = len(data) - added

        self.last_modified = now()
        self.last_modified_by = user

        if added and updated:
            self.last_modification = _('%(added)i rows added and %(updated)i updated') \
                % {'added': added, 'updated': updated}
        elif added:
            self.last_modification = _('%i rows added') % added
        else:
            self.last_modification = _('%i rows updated') % updated

        self.save()

        return solr_rows
    finally:
        self.unlock()

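# Illustrative usage sketch for add_many_rows (not part of the PANDA source): it assumes an
# existing Dataset instance and User, and that each tuple pairs a row's values with an
# optional external_id, as the docstring above describes. Row values and ids are made up.
#
#     rows = [
#         (['Alice', 'Springfield', '32'], None),
#         (['Bob', 'Shelbyville', '45'], 'employee-45'),
#     ]
#     dataset.add_many_rows(user, rows)
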
def update_full_text(self, commit=True):
    """
    Update the full-text search metadata for this dataset stored in Solr.
    """
    category_ids = []

    full_text_data = [
        self.name,
        self.description,
        "%s %s" % (self.creator.first_name, self.creator.last_name),
        self.creator.email,
    ]

    for category in self.categories.all():
        category_ids.append(category.id)
        full_text_data.append(category.name)

    if not category_ids:
        category_ids.append(settings.PANDA_UNCATEGORIZED_ID)
        full_text_data.append(settings.PANDA_UNCATEGORIZED_NAME)

    for data_upload in self.data_uploads.all():
        full_text_data.append(data_upload.original_filename)

    for related_upload in self.related_uploads.all():
        full_text_data.append(related_upload.original_filename)

    if self.column_schema is not None:
        full_text_data.extend([c["name"] for c in self.column_schema])

    full_text = "\n".join(full_text_data)

    solr.add(
        settings.SOLR_DATASETS_CORE,
        [
            {
                "slug": self.slug,
                "creation_date": self.creation_date.isoformat() + "Z",
                "categories": category_ids,
                "full_text": full_text,
            }
        ],
        commit=commit,
    )

def update_full_text(self, commit=True):
    """
    Update the full-text search metadata for this dataset stored in Solr.
    """
    category_ids = []

    full_text_data = [
        unquote(self.name),
        unquote(self.description),
        '%s %s' % (self.creator.first_name, self.creator.last_name),
        self.creator.email
    ]

    for category in self.categories.all():
        category_ids.append(category.id)
        full_text_data.append(category.name)

    if not category_ids:
        category_ids.append(settings.PANDA_UNCATEGORIZED_ID)
        full_text_data.append(settings.PANDA_UNCATEGORIZED_NAME)

    for data_upload in self.data_uploads.all():
        full_text_data.append(data_upload.original_filename)

    for related_upload in self.related_uploads.all():
        full_text_data.append(related_upload.original_filename)

    if self.column_schema is not None:
        full_text_data.extend([c['name'] for c in self.column_schema])

    # Convert any i18n proxies into strings
    full_text = u'\n'.join(map(unicode, full_text_data))

    solr.add(settings.SOLR_DATASETS_CORE, [{
        'slug': self.slug,
        'creation_date': self.creation_date.isoformat() + 'Z',
        'categories': category_ids,
        'full_text': full_text
    }], commit=commit)

def update_full_text(self, commit=True):
    """
    Update the full-text search metadata for this dataset stored in Solr.
    """
    category_ids = []

    full_text_data = [
        unquote(self.name),
        unquote(self.description),
        '%s %s' % (self.creator.first_name, self.creator.last_name),
        self.creator.email
    ]

    for category in self.categories.all():
        category_ids.append(category.id)
        full_text_data.append(category.name)

    if not category_ids:
        category_ids.append(settings.PANDA_UNCATEGORIZED_ID)
        full_text_data.append(settings.PANDA_UNCATEGORIZED_NAME)

    for data_upload in self.data_uploads.all():
        full_text_data.append(data_upload.original_filename)

    for related_upload in self.related_uploads.all():
        full_text_data.append(related_upload.original_filename)

    if self.column_schema is not None:
        full_text_data.extend([c['name'] for c in self.column_schema])

    full_text = '\n'.join(full_text_data)

    solr.add(settings.SOLR_DATASETS_CORE, [{
        'slug': self.slug,
        'creation_date': self.creation_date.isoformat() + 'Z',
        'categories': category_ids,
        'full_text': full_text
    }], commit=commit)

def add_row(self, user, data, external_id=None):
    """
    Add (or overwrite) a row to this dataset.
    """
    solr_row = utils.solr.make_data_row(self, data, external_id=external_id)

    solr.add(settings.SOLR_DATA_CORE, [solr_row], commit=True)

    if not self.sample_data:
        self.sample_data = []

    if len(self.sample_data) < 5:
        self.sample_data.append(data)

    old_row_count = self.row_count
    self.row_count = self._count_rows()
    added = self.row_count - (old_row_count or 0)

    self.last_modified = datetime.utcnow()
    self.last_modified_by = user
    self.last_modification = '1 row %s' % ('added' if added else 'updated')

    self.save()

    return solr_row

def add_row(self, user, data, external_id=None):
    """
    Add (or overwrite) a row to this dataset.
    """
    self.lock()

    try:
        data_typer = DataTyper(self.column_schema)

        solr_row = utils.solr.make_data_row(self, data, external_id=external_id)
        solr_row = data_typer(solr_row, data)

        solr.add(settings.SOLR_DATA_CORE, [solr_row], commit=True)

        self.schema = data_typer.schema

        if not self.sample_data:
            self.sample_data = []

        if len(self.sample_data) < 5:
            self.sample_data.append(data)

        old_row_count = self.row_count
        self.row_count = self._count_rows()
        added = self.row_count - (old_row_count or 0)

        self.last_modified = now()
        self.last_modified_by = user
        self.last_modification = _('1 row %s') % ('added' if added else 'updated')

        self.save()

        return solr_row
    finally:
        self.unlock()

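# Illustrative usage sketch for add_row (not part of the PANDA source): assumes an existing
# Dataset instance and User. external_id is optional; per the docstring, supplying one that
# already exists presumably overwrites that row rather than adding a new one. Values are made up.
#
#     dataset.add_row(user, ['Alice', 'Springfield', '32'], external_id='employee-32')
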
def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
    """
    Execute import.
    """
    from panda.models import Dataset, DataUpload

    log = logging.getLogger(self.name)
    log.info('Beginning import, dataset_slug: %s' % dataset_slug)

    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Import failed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    upload = DataUpload.objects.get(id=upload_id)

    task_status = dataset.current_task
    task_status.begin(ugettext('Preparing to import'))

    book = xlrd.open_workbook(upload.get_path(), on_demand=True)
    sheet = book.sheet_by_index(0)
    row_count = sheet.nrows

    add_buffer = []
    data_typer = DataTyper(dataset.column_schema)
    throttle = config_value('PERF', 'TASK_THROTTLE')

    for i in range(1, row_count):
        values = sheet.row_values(i)
        types = sheet.row_types(i)

        normal_values = []

        for v, t in zip(values, types):
            if t == xlrd.biffh.XL_CELL_DATE:
                v = utils.xls.normalize_date(v, book.datemode)
            elif t == xlrd.biffh.XL_CELL_NUMBER:
                if v % 1 == 0:
                    v = int(v)

            normal_values.append(unicode(v))

        external_id = None

        if external_id_field_index is not None:
            external_id = values[external_id_field_index]

        data = utils.solr.make_data_row(dataset, normal_values, data_upload=upload, external_id=external_id)
        data = data_typer(data, normal_values)

        add_buffer.append(data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.update(ugettext('%.0f%% complete') % floor(float(i) / float(row_count) * 100))

            if self.is_aborted():
                task_status.abort(ugettext('Aborted after importing %.0f%%') % floor(float(i) / float(row_count) * 100))
                log.warning('Import aborted, dataset_slug: %s' % dataset_slug)
                return

            time.sleep(throttle)

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)
        add_buffer = []

    solr.commit(settings.SOLR_DATA_CORE)

    task_status.update(ugettext('100% complete'))

    # Refresh dataset from database so there is no chance of crushing changes made since the task started
    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Import could not be completed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    if not dataset.row_count:
        dataset.row_count = i
    else:
        dataset.row_count += i

    dataset.column_schema = data_typer.schema

    dataset.save()

    # Refresh upload
    upload = DataUpload.objects.get(id=upload_id)
    upload.imported = True
    upload.save()

    log.info('Finished import, dataset_slug: %s' % dataset_slug)

    return data_typer

def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
    """
    Execute import.
    """
    from panda.models import Dataset, DataUpload

    log = logging.getLogger(self.name)
    log.info('Beginning import, dataset_slug: %s' % dataset_slug)

    dataset = Dataset.objects.get(slug=dataset_slug)
    upload = DataUpload.objects.get(id=upload_id)

    task_status = dataset.current_task
    task_status.begin('Preparing to import')

    line_count = self._count_lines(upload.get_path())

    if self.is_aborted():
        task_status.abort('Aborted during preparation')
        log.warning('Import aborted, dataset_slug: %s' % dataset_slug)
        return

    f = open(upload.get_path(), 'r')

    reader = CSVKitReader(f, encoding=upload.encoding, **upload.dialect_as_parameters())
    reader.next()

    add_buffer = []
    data_typer = DataTyper(dataset.column_schema)
    throttle = config_value('PERF', 'TASK_THROTTLE')

    i = 0

    while True:
        # The row number which is about to be read, for error handling and indexing
        i += 1

        try:
            row = reader.next()
        except StopIteration:
            i -= 1
            break
        except UnicodeDecodeError:
            raise DataImportError('This CSV file contains characters that are not %s encoded in or after row %i. You need to re-upload this file and input the correct encoding in order to import data from this file.' % (upload.encoding, i))

        external_id = None

        if external_id_field_index is not None:
            external_id = row[external_id_field_index]

        data = utils.solr.make_data_row(dataset, row, external_id=external_id)
        data = data_typer(data, row)

        add_buffer.append(data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.update('%.0f%% complete (estimated)' % floor(float(i) / float(line_count) * 100))

            if self.is_aborted():
                task_status.abort('Aborted after importing %.0f%% (estimated)' % floor(float(i) / float(line_count) * 100))
                log.warning('Import aborted, dataset_slug: %s' % dataset_slug)
                return

            time.sleep(throttle)

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)
        add_buffer = []

    solr.commit(settings.SOLR_DATA_CORE)

    f.close()

    task_status.update('100% complete')

    # Refresh dataset from database so there is no chance of crushing changes made since the task started
    dataset = Dataset.objects.get(slug=dataset_slug)

    if not dataset.row_count:
        dataset.row_count = i
    else:
        dataset.row_count += i

    dataset.column_schema = data_typer.schema

    dataset.save()

    # Refresh upload
    upload = DataUpload.objects.get(id=upload_id)
    upload.imported = True
    upload.save()

    log.info('Finished import, dataset_slug: %s' % dataset_slug)

    return data_typer

def run(self, dataset_slug, *args, **kwargs):
    """
    Execute reindex.
    """
    from panda.models import Dataset

    log = logging.getLogger(self.name)
    log.info('Beginning reindex, dataset_slug: %s' % dataset_slug)

    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Reindexing failed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    task_status = dataset.current_task
    task_status.begin(ugettext('Preparing to reindex'))

    if self.is_aborted():
        task_status.abort(ugettext('Aborted during preparation'))
        log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)
        return

    read_buffer = []
    add_buffer = []
    data_typer = DataTyper(dataset.column_schema)
    throttle = config_value('PERF', 'TASK_THROTTLE')

    i = 0

    while i < dataset.row_count:
        if not read_buffer:
            query = 'dataset_slug: %s' % (dataset.slug)
            response = solr.query(settings.SOLR_DATA_CORE, query, limit=SOLR_READ_BUFFER_SIZE, offset=i)
            read_buffer = response['response']['docs']

        data = read_buffer.pop(0)
        row = json.loads(data['data'])

        new_data = utils.solr.make_data_row(dataset, row)
        new_data['id'] = data['id']
        new_data['data_upload_id'] = data['data_upload_id']
        new_data = data_typer(new_data, row)

        add_buffer.append(new_data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.update(ugettext('%.0f%% complete') % floor(float(i) / float(dataset.row_count) * 100))

            if self.is_aborted():
                task_status.abort(ugettext('Aborted after reindexing %.0f%%') % floor(float(i) / float(dataset.row_count) * 100))
                log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)
                return

            time.sleep(throttle)

        i += 1

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)
        add_buffer = []

    solr.commit(settings.SOLR_DATA_CORE)

    task_status.update(ugettext('100% complete'))

    # Refresh dataset
    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Reindexing could not be completed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    dataset.column_schema = data_typer.schema

    dataset.save()

    log.info('Finished reindex, dataset_slug: %s' % dataset_slug)

    return data_typer

def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
    """
    Execute import.
    """
    from panda.models import Dataset, DataUpload

    log = logging.getLogger(self.name)
    log.info('Beginning import, dataset_slug: %s' % dataset_slug)

    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Import failed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    upload = DataUpload.objects.get(id=upload_id)

    task_status = dataset.current_task
    task_status.begin('Preparing to import')

    line_count = self._count_lines(upload.get_path())

    if self.is_aborted():
        task_status.abort('Aborted during preparation')
        log.warning('Import aborted, dataset_slug: %s' % dataset_slug)
        return

    f = open(upload.get_path(), 'r')

    reader = CSVKitReader(f, encoding=upload.encoding, **upload.dialect_as_parameters())
    reader.next()

    add_buffer = []
    data_typer = DataTyper(dataset.column_schema)
    throttle = config_value('PERF', 'TASK_THROTTLE')

    i = 0

    while True:
        # The row number which is about to be read, for error handling and indexing
        i += 1

        try:
            row = reader.next()
        except StopIteration:
            i -= 1
            break
        except UnicodeDecodeError:
            raise DataImportError('This CSV file contains characters that are not %s encoded in or after row %i. You need to re-upload this file and input the correct encoding in order to import data from this file.' % (upload.encoding, i))

        external_id = None

        if external_id_field_index is not None:
            external_id = row[external_id_field_index]

        data = utils.solr.make_data_row(dataset, row, data_upload=upload, external_id=external_id)
        data = data_typer(data, row)

        add_buffer.append(data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.update('%.0f%% complete (estimated)' % floor(float(i) / float(line_count) * 100))

            if self.is_aborted():
                task_status.abort('Aborted after importing %.0f%% (estimated)' % floor(float(i) / float(line_count) * 100))
                log.warning('Import aborted, dataset_slug: %s' % dataset_slug)
                return

            time.sleep(throttle)

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)
        add_buffer = []

    solr.commit(settings.SOLR_DATA_CORE)

    f.close()

    task_status.update('100% complete')

    # Refresh dataset from database so there is no chance of crushing changes made since the task started
    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Import could not be completed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    if not dataset.row_count:
        dataset.row_count = i
    else:
        dataset.row_count += i

    dataset.column_schema = data_typer.schema

    dataset.save()

    # Refresh upload
    upload = DataUpload.objects.get(id=upload_id)
    upload.imported = True
    upload.save()

    log.info('Finished import, dataset_slug: %s' % dataset_slug)

    return data_typer

def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
    """
    Execute import.
    """
    from panda.models import Dataset, DataUpload

    log = logging.getLogger(self.name)
    log.info('Beginning import, dataset_slug: %s' % dataset_slug)

    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Import failed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    upload = DataUpload.objects.get(id=upload_id)

    task_status = dataset.current_task
    task_status.begin('Preparing to import')

    book = load_workbook(upload.get_path(), use_iterators=True)
    sheet = book.get_active_sheet()
    row_count = sheet.get_highest_row()

    add_buffer = []
    data_typer = DataTyper(dataset.column_schema)
    throttle = config_value('PERF', 'TASK_THROTTLE')

    for i, row in enumerate(sheet.iter_rows()):
        # Skip header
        if i == 0:
            continue

        values = []

        for c in row:
            value = c.internal_value

            if value.__class__ is datetime.datetime:
                value = utils.xlsx.normalize_date(value)
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            values.append(value)

        external_id = None

        if external_id_field_index is not None:
            external_id = values[external_id_field_index]

        data = utils.solr.make_data_row(dataset, values, data_upload=upload, external_id=external_id)
        data = data_typer(data, values)

        add_buffer.append(data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.update('%.0f%% complete' % floor(float(i) / float(row_count) * 100))

            if self.is_aborted():
                task_status.abort('Aborted after importing %.0f%%' % floor(float(i) / float(row_count) * 100))
                log.warning('Import aborted, dataset_slug: %s' % dataset_slug)
                return

            time.sleep(throttle)

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)
        add_buffer = []

    solr.commit(settings.SOLR_DATA_CORE)

    task_status.update('100% complete')

    # Refresh dataset from database so there is no chance of crushing changes made since the task started
    try:
        dataset = Dataset.objects.get(slug=dataset_slug)
    except Dataset.DoesNotExist:
        log.warning('Import could not be completed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)
        return

    if not dataset.row_count:
        dataset.row_count = i
    else:
        dataset.row_count += i

    dataset.column_schema = data_typer.schema

    dataset.save()

    # Refresh upload
    upload = DataUpload.objects.get(id=upload_id)
    upload.imported = True
    upload.save()

    log.info('Finished import, dataset_slug: %s' % dataset_slug)

    return data_typer

def run(self, dataset_slug, *args, **kwargs):
    """
    Execute reindex.
    """
    from panda.models import Dataset

    log = logging.getLogger(self.name)
    log.info('Beginning reindex, dataset_slug: %s' % dataset_slug)

    dataset = Dataset.objects.get(slug=dataset_slug)

    task_status = dataset.current_task
    task_status.begin('Preparing to reindex')

    if self.is_aborted():
        task_status.abort('Aborted during preparation')
        log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)
        return

    read_buffer = []
    add_buffer = []

    i = 0

    while i < dataset.row_count:
        if not read_buffer:
            query = 'dataset_slug: %s' % (dataset.slug)
            response = solr.query(settings.SOLR_DATA_CORE, query, limit=SOLR_READ_BUFFER_SIZE, offset=i, sort='id asc')
            read_buffer = response['response']['docs']

        data = read_buffer.pop(0)
        row = json.loads(data['data'])

        new_data = utils.solr.make_data_row(dataset, row)
        new_data['id'] = data['id']

        # Generate typed column data
        for n, c in enumerate(dataset.column_schema):
            if c['indexed']:
                try:
                    value = coerce_type(row[n], TYPE_NAMES_MAPPING[c['type']])
                    new_data[c['indexed_name']] = value
                except TypeCoercionError, e:
                    # TODO: log here
                    pass

        add_buffer.append(new_data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.update('%.0f%% complete' % floor(float(i) / float(dataset.row_count) * 100))

            if self.is_aborted():
                task_status.abort('Aborted after reindexing %.0f%%' % floor(float(i) / float(dataset.row_count) * 100))
                log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)
                return

        i += 1

def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
    """
    Execute import.
    """
    from panda.models import Dataset, DataUpload

    log = logging.getLogger(self.name)
    log.info('Beginning import, dataset_slug: %s' % dataset_slug)

    dataset = Dataset.objects.get(slug=dataset_slug)
    upload = DataUpload.objects.get(id=upload_id)

    task_status = dataset.current_task
    task_status.begin('Preparing to import')

    book = load_workbook(upload.get_path(), use_iterators=True)
    sheet = book.get_active_sheet()
    row_count = sheet.get_highest_row()

    add_buffer = []
    data_typer = DataTyper(dataset.column_schema)
    throttle = config_value('PERF', 'TASK_THROTTLE')

    for i, row in enumerate(sheet.iter_rows()):
        # Skip header
        if i == 0:
            continue

        values = []

        for c in row:
            value = c.internal_value

            if value.__class__ is datetime.datetime:
                value = utils.xlsx.normalize_date(value)
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            values.append(value)

        external_id = None

        if external_id_field_index is not None:
            external_id = values[external_id_field_index]

        data = utils.solr.make_data_row(dataset, values, external_id=external_id)
        data = data_typer(data, values)

        add_buffer.append(data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.update('%.0f%% complete' % floor(float(i) / float(row_count) * 100))

            if self.is_aborted():
                task_status.abort('Aborted after importing %.0f%%' % floor(float(i) / float(row_count) * 100))
                log.warning('Import aborted, dataset_slug: %s' % dataset_slug)
                return

            time.sleep(throttle)

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)
        add_buffer = []

    solr.commit(settings.SOLR_DATA_CORE)

    task_status.update('100% complete')

    # Refresh dataset from database so there is no chance of crushing changes made since the task started
    dataset = Dataset.objects.get(slug=dataset_slug)

    if not dataset.row_count:
        dataset.row_count = i
    else:
        dataset.row_count += i

    dataset.column_schema = data_typer.schema

    dataset.save()

    # Refresh upload
    upload = DataUpload.objects.get(id=upload_id)
    upload.imported = True
    upload.save()

    log.info('Finished import, dataset_slug: %s' % dataset_slug)

    return data_typer

def run(self, dataset_slug, *args, **kwargs):
    """
    Execute reindex.
    """
    from panda.models import Dataset

    log = logging.getLogger(self.name)
    log.info("Beginning reindex, dataset_slug: %s" % dataset_slug)

    dataset = Dataset.objects.get(slug=dataset_slug)

    task_status = dataset.current_task
    task_status.begin("Preparing to reindex")

    if self.is_aborted():
        task_status.abort("Aborted during preparation")
        log.warning("Reindex aborted, dataset_slug: %s" % dataset_slug)
        return

    read_buffer = []
    add_buffer = []
    data_typer = DataTyper(dataset.column_schema)
    throttle = config_value("PERF", "TASK_THROTTLE")

    i = 0

    while i < dataset.row_count:
        if not read_buffer:
            query = "dataset_slug: %s" % (dataset.slug)
            response = solr.query(settings.SOLR_DATA_CORE, query, limit=SOLR_READ_BUFFER_SIZE, offset=i)
            read_buffer = response["response"]["docs"]

        data = read_buffer.pop(0)
        row = json.loads(data["data"])

        new_data = utils.solr.make_data_row(dataset, row)
        new_data["id"] = data["id"]
        new_data["data_upload_id"] = data["data_upload_id"]
        new_data = data_typer(new_data, row)

        add_buffer.append(new_data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.update("%.0f%% complete" % floor(float(i) / float(dataset.row_count) * 100))

            if self.is_aborted():
                task_status.abort("Aborted after reindexing %.0f%%" % floor(float(i) / float(dataset.row_count) * 100))
                log.warning("Reindex aborted, dataset_slug: %s" % dataset_slug)
                return

            time.sleep(throttle)

        i += 1

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)
        add_buffer = []

    solr.commit(settings.SOLR_DATA_CORE)

    task_status.update("100% complete")

    # Refresh dataset
    dataset = Dataset.objects.get(slug=dataset_slug)

    dataset.column_schema = data_typer.schema

    dataset.save()

    log.info("Finished reindex, dataset_slug: %s" % dataset_slug)

    return data_typer

def run(self, dataset_slug, upload_id, external_id_field_index=None, *args, **kwargs):
    """
    Execute import.
    """
    from panda.models import Dataset, DataUpload

    log = logging.getLogger(self.name)
    log.info('Beginning import, dataset_slug: %s' % dataset_slug)

    dataset = Dataset.objects.get(slug=dataset_slug)
    upload = DataUpload.objects.get(id=upload_id)

    task_status = dataset.current_task
    self.task_start(task_status, 'Preparing to import')

    book = xlrd.open_workbook(upload.get_path(), on_demand=True)
    sheet = book.sheet_by_index(0)
    row_count = sheet.nrows

    add_buffer = []

    for i in range(1, row_count):
        values = sheet.row_values(i)
        types = sheet.row_types(i)

        normal_values = []

        for v, t in zip(values, types):
            if t == xlrd.biffh.XL_CELL_DATE:
                v = utils.xls.normalize_date(v, book.datemode)
            elif t == xlrd.biffh.XL_CELL_NUMBER:
                if v % 1 == 0:
                    v = int(v)

            normal_values.append(unicode(v))

        external_id = None

        if external_id_field_index is not None:
            external_id = values[external_id_field_index]

        data = utils.solr.make_data_row(dataset, normal_values, external_id=external_id)

        add_buffer.append(data)

        if i % SOLR_ADD_BUFFER_SIZE == 0:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

            task_status.message = '%.0f%% complete' % floor(float(i) / float(row_count) * 100)
            task_status.save()

            if self.is_aborted():
                self.task_abort(task_status, 'Aborted after importing %.0f%%' % floor(float(i) / float(row_count) * 100))
                log.warning('Import aborted, dataset_slug: %s' % dataset_slug)
                return

    if add_buffer:
        solr.add(settings.SOLR_DATA_CORE, add_buffer)
        add_buffer = []

    solr.commit(settings.SOLR_DATA_CORE)

    self.task_update(task_status, '100% complete')

    # Refresh dataset from database so there is no chance of crushing changes made since the task started
    dataset = Dataset.objects.get(slug=dataset_slug)

    if not dataset.row_count:
        dataset.row_count = i
    else:
        dataset.row_count += i

    dataset.save()

    # Refresh upload
    upload = DataUpload.objects.get(id=upload_id)
    upload.imported = True
    upload.save()

    log.info('Finished import, dataset_slug: %s' % dataset_slug)
