def test_async_exporter(self):
    """Check that every entity handed to the exporter is stored after a flush.

    Appends ``self.expected`` UnitTestModel instances, each with a distinct
    ``foo`` value, flushes the exporter, and compares the number of entities
    returned by a query against ``self.expected``.
    """
    # The constructor argument is presumably a batch size -- confirm against
    # AsyncNdbExporter.
    exporter = AsyncNdbExporter(2)
    for index in range(self.expected):
        model = UnitTestModel()
        model.foo = str(index)
        exporter.append(model)
    exporter.flush()

    stored = UnitTestModel.query().fetch()
    self.assertEqual(len(stored), self.expected)
def test_sharded_student_importer(self):
    """Check the sharded importer's counts in both multi and single mode.

    Stub students are written to the input file, loaded through the stub CSV
    importer with sharding enabled, and saved to the database.  A fresh
    ShardedStudentImporter is then checked once with ``multi=True`` and once
    with ``multi=False``.
    """
    student_total = self.config.estimated_student_count
    self.write_stub_student_file(student_total)
    exporter = AsyncNdbExporter(student_total)

    # Prep work: load student stubs to database with sharding enabled
    stub_importer = StubCsvImporter(self.config, use_shards=True)
    for student in stub_importer.generate_single():
        exporter.append(student)
    exporter.flush()
    self.assertEqual(student_total, len(Student.get_all_id_numbers()))

    # Each mode gets its own importer instance, multi mode first.
    for multi_mode in (True, False):
        sharded_importer = ShardedStudentImporter()
        self.check_importer_count(student_total, sharded_importer, multi=multi_mode)
def load_students_to_db(config):
    """Import students batch by batch, clean them up, and save them to the DB.

    Students are saved to the DB as we go -- we load a batch from the input
    file, convert them to students, perform cleanup, then pass them to the
    exporter.

    Any students that already existed in the DB when the task started are
    ignored -- this allows us to make forward progress even if the task times
    out during import.

    There's a minor chance of data leakage if the task terminates while the
    unique visitors are still processing a student. That won't cause issues
    though -- when the job resumes, that student will receive a fresh unique
    ID and the "zombie" record will simply be an unused number with no
    database footprint.

    Args:
        config: run configuration.  ``estimated_student_count`` is read here
            (and initialized if still unset); the object is also handed to
            the importer, exporter and visitor factories.
    """
    if config.estimated_student_count == STUDENT_COUNT_NOT_INITIALIZED:
        logging.info("Initializing student count")
        initialize_student_count(config)

    importer = ImporterFactory.get(config)
    exporter = AsyncNdbExporter.get_or_create(config)
    visitors = VisitorFactory.get_cleanup_visitors(config)

    existing_sids = Student.get_all_id_numbers()
    if len(existing_sids) > 0:
        logging.info("{} students already loaded, the importer will skip those records".format(
            len(existing_sids)))

    # Membership is tested once per imported student, so make sure the lookup
    # is hash-based instead of a linear scan.  Sids are already used as
    # mapping keys below, so they are hashable.
    if isinstance(existing_sids, (set, frozenset)):
        known_sids = existing_sids
    else:
        known_sids = frozenset(existing_sids)

    for student_batch in importer.generate_multi():
        # First, check if the student is already in the datastore (this means
        # they've already been cleaned up and are ready for export).
        # Then load their data from the stored cache (creating a fresh csd
        # object if this is a new student).
        students_not_in_db = [stu for stu in student_batch
                              if stu.sid not in known_sids]
        cached_student_data = CachedStudentData.get_or_create_multi(
            [stu.sid for stu in students_not_in_db])
        for student in students_not_in_db:
            student.copy_from_cache(cached_student_data[student.sid])

        # Now perform data cleanup and (when done) save the students to DB
        for visitor in visitors:
            visitor.accept_multi(students_not_in_db)

        # The dirty list is materialized before any cache is written so the
        # flags are all read up front.
        dirty_students = [stu for stu in students_not_in_db if stu.cache_is_dirty]
        for student in dirty_students:
            cache = cached_student_data[student.sid]
            student.copy_to_cache(cache)
            exporter.append(cache)

        for student in students_not_in_db:
            student.cleanup_complete = True
            exporter.append(student)

        # Manual flush after each batch instead of at the end of the run; the
        # uniquifier will also be dumping entities into the exporter, so we
        # need to make sure that everything's been committed before starting
        # a fresh batch (those entities are only cached within the scope of
        # an accept_multi run)
        exporter.flush()

    for visitor in visitors:
        visitor.close()

    logging.info("Filter results: {} accepted, {} rejected".format(
        importer.import_count(), importer.reject_count()))
    logging.info("All students loaded to database")
def test_multi_exporter(self):
    """Check that one exporter can store entities of two different kinds.

    For each of ``self.expected`` rounds, one UnitTestModel and one
    AnotherUnitTestModel are appended (in that order).  After flushing, the
    combined number of stored entities must be twice ``self.expected``.
    """
    exporter = AsyncNdbExporter(2)
    for _ in range(self.expected):
        for model_class in (UnitTestModel, AnotherUnitTestModel):
            exporter.append(model_class())
    exporter.flush()

    first_kind = UnitTestModel.query().fetch()
    second_kind = AnotherUnitTestModel.query().fetch()
    stored_total = len(first_kind) + len(second_kind)
    self.assertEqual(stored_total, self.expected * 2)
def __init__(self, config):
    """Set up the visitor from ``config``.

    Reads ``unique_strings_case_sensitive`` and the attribute named by
    ``self.suppress_first_label()`` from ``config``, obtains the shared
    exporter via ``AsyncNdbExporter.get_or_create``, and records
    ``self.student_label()`` as ``kind``.
    """
    super(UniqueStringVisitor, self).__init__(config)

    self.case_sensitive = config.unique_strings_case_sensitive

    # The name of the config attribute holding the suppress-first setting
    # comes from suppress_first_label().
    suppress_attr = self.suppress_first_label()
    self.suppress_first = getattr(config, suppress_attr)

    shared_exporter = AsyncNdbExporter.get_or_create(config)
    self.exporter = shared_exporter

    self.kind = self.student_label()