def _create_work_batches(cls, subTask, rawPieces, priority):
	"""Group *rawPieces* into Batch/Page/PageMemberEntry trees for one sub task.

	The sub task's batching mode decides the grouping key and ordering;
	each resulting group becomes one Batch, paginated by maxPageSize.
	Returns the list of (unsaved) Batch objects.
	"""
	key_gen, order_by = cls._get_key_gen(subTask.batchingMode, subTask.maxPageSize)
	grouped = cls._group_into_batches(rawPieces, key_gen, order_by)

	result = []
	for group_key, pieces in grouped.iteritems():
		# A tuple key carries the display name in its first slot.
		batch_name = group_key[0] if isinstance(group_key, tuple) else group_key
		batch = m.Batch(
			taskId=subTask.taskId,
			subTaskId=subTask.subTaskId,
			priority=priority,
			name=batch_name,
		)
		pages = split_by_size(pieces, subTask.maxPageSize)
		for page_index, page_pieces in enumerate(pages):
			page = m.Page(pageIndex=page_index)
			batch.pages.append(page)
			for member_index, piece in enumerate(page_pieces):
				entry = m.PageMemberEntry(memberIndex=member_index)
				entry.rawPieceId = piece.rawPieceId
				page.memberEntries.append(entry)
		result.append(batch)
	return result
def route(batch_router, raw_pieces, priority=5):
	"""Distribute *raw_pieces* across the router's sub tasks as new batches.

	All routed sub tasks must share one batching mode; the smallest
	maxPageSize among them drives key generation, while each routed
	sub task's own maxPageSize drives pagination. Returns the list of
	(unsaved) Batch objects.
	"""
	# Collect the routed sub tasks; routing with none configured is a bug.
	sub_tasks = [link.sub_task for link in batch_router.sub_tasks]
	assert sub_tasks

	# Enforce a single batching mode and find the minimum page size
	# (the page size only matters for the None batching mode).
	batching_mode = sub_tasks[0].batchingMode
	page_size = None
	for st in sub_tasks:
		if st.batchingMode != batching_mode:
			raise ValueError("inconsistent batching modes")
		page_size = st.maxPageSize if page_size is None else min(page_size, st.maxPageSize)

	key_gen, order_by = _batcher._get_key_gen(batching_mode, page_size)
	grouped = _batcher._group_into_batches(raw_pieces, key_gen, order_by)

	result = []
	for group_key, pieces in grouped.iteritems():
		batch_name = group_key[0] if isinstance(group_key, tuple) else group_key
		# The key decides which sub task receives this group.
		target = batch_router.get_routed_sub_task(group_key)
		batch = m.Batch(
			taskId=target.taskId,
			subTaskId=target.subTaskId,
			priority=priority,
			name=batch_name,
		)
		for page_index, page_pieces in enumerate(split_by_size(pieces, target.maxPageSize)):
			page = m.Page(pageIndex=page_index)
			batch.pages.append(page)
			for member_index, piece in enumerate(page_pieces):
				entry = m.PageMemberEntry(memberIndex=member_index)
				entry.rawPieceId = piece.rawPieceId
				page.memberEntries.append(entry)
		result.append(batch)
	return result
def create_qa_batches(self, qaSubTask, userId, intervalId, samples, priority=5):
	"""Create single-page QA batches from sampled work entry ids.

	Each page-sized slice of *samples* becomes one Batch (excluded from
	*userId* via notUserId, tied to *intervalId*). Batches are added to
	the session and flushed; nothing is returned. No-op for empty input.
	"""
	if not samples:
		return
	for page_load in self.paginate(samples, qaSubTask.maxPageSize):
		batch = m.Batch(
			taskId=qaSubTask.taskId,
			subTaskId=qaSubTask.subTaskId,
			notUserId=userId,
			workIntervalId=intervalId,
			priority=priority,
		)
		# QA batches always hold exactly one page.
		page = m.Page(pageIndex=0)
		batch.pages.append(page)
		for member_index, work_entry_id in enumerate(page_load):
			entry = m.PageMemberEntry(memberIndex=member_index)
			entry.workEntryId = work_entry_id
			page.memberEntries.append(entry)
		SS.add(batch)
	SS.flush()
def _create_rework_batches(subTask, rawPieceIds, priority):
	"""Build single-page rework batches of up to maxPageSize raw pieces each.

	Consecutive raw piece ids are chunked by index into groups of
	subTask.maxPageSize; each group becomes one Batch containing one Page.
	Input order is preserved (OrderedDict keeps group order, enumerate
	keeps member order). Returns the list of (unsaved) Batch objects.
	"""
	# NOTE: use explicit floor division. The original `i / maxPageSize`
	# only worked because of Python 2 implicit integer division; under
	# `from __future__ import division` (or Python 3) it would yield
	# float keys and break the grouping.
	key_gen = lambda i, x: (None, i // subTask.maxPageSize)
	loads = OrderedDict()
	for i, rawPieceId in enumerate(rawPieceIds):
		loads.setdefault(key_gen(i, None), []).append(rawPieceId)
	batches = []
	for batch_load in loads.values():
		b = m.Batch(taskId=subTask.taskId, subTaskId=subTask.subTaskId, priority=priority)
		# Each group is already <= maxPageSize, so this produces one page
		# per batch; split_by_size is kept for consistency with the other
		# batch builders.
		for pageIndex, page_load in enumerate(split_by_size(batch_load, subTask.maxPageSize)):
			p = m.Page(pageIndex=pageIndex)
			b.pages.append(p)
			for memberIndex, rawPieceId in enumerate(page_load):
				memberEntry = m.PageMemberEntry(memberIndex=memberIndex)
				memberEntry.rawPieceId = rawPieceId
				p.memberEntries.append(memberEntry)
		batches.append(b)
	return batches
def load_qa_failed(task):
	"""Re-batch QA-failed work items into their configured rework sub tasks.

	For every sub task of *task* whose qaConfig enables populateRework,
	find the latest transcription-modifying work entry per raw piece,
	check its most recent QA score against the configured accuracy
	threshold, and group failing entries per (rework sub task, user).
	Each group becomes one single-page Batch added to the session.
	Nothing is returned; the session is not flushed here.
	"""
	print 'checking task %s' % task.taskId
	subTaskById = dict([(s.subTaskId, s) for s in task.subTasks])
	# work_plan: qaSubTaskId -> {'qaSubTask': ..., <srcSubTaskId>: {src, dest, threshold}}
	work_plan = {}
	for s in task.subTasks:
		if s.qaConfig and s.qaConfig.populateRework:
			qaSubTask = subTaskById[s.qaConfig.qaSubTaskId]
			reworkSubTask = subTaskById[s.qaConfig.reworkSubTaskId]
			rec = work_plan.setdefault(s.qaConfig.qaSubTaskId, {})
			if rec == {}:
				rec['qaSubTask'] = qaSubTask
			rec[s.subTaskId] = {
				'src': s,
				'dest': reworkSubTask,
				'threshold': s.qaConfig.accuracyThreshold
			}
	# Sub tasks whose work type modifies the transcription — only their
	# entries count as "the work" that QA judged.
	subTaskIds = [
		s.subTaskId for s in SS.query(m.SubTask.subTaskId).join(
			m.WorkType, m.SubTask.workTypeId == m.WorkType.workTypeId).filter(
			m.WorkType.modifiesTranscription).filter(
			m.Task.taskId == task.taskId)
	]
	# Latest work entry per raw piece (DISTINCT ON rawPieceId with
	# created DESC keeps the newest row per piece).
	q_latest = m.WorkEntry.query.filter(
		m.WorkEntry.taskId == task.taskId).filter(
		m.WorkEntry.subTaskId.in_(subTaskIds)).distinct(
		m.WorkEntry.rawPieceId).order_by(m.WorkEntry.rawPieceId,
			m.WorkEntry.created.desc())
	# currently_batched: reworkSubTaskId -> set of rawPieceIds already
	# sitting in rework pages, so we never batch a piece twice.
	currently_batched = {}
	for r in m.PageMember.query.filter(
			m.PageMember.taskId == task.taskId).filter(
			m.PageMember.workType == m.WorkType.REWORK).all():
		currently_batched.setdefault(r.subTaskId, set()).add(r.rawPieceId)
	# group_plan: reworkSubTask -> userId -> [failing work entries]
	group_plan = {}
	for entry in q_latest.all():
		log.debug('checking raw piece {}, entry {}'.format(
			entry.rawPieceId, entry.entryId))
		# Newest QA record for this work entry, if any.
		qa_record = m.QaTypeEntry.query.filter(
			m.QaTypeEntry.qaedEntryId == entry.entryId).distinct(
			m.QaTypeEntry.qaedEntryId).order_by(
			m.QaTypeEntry.qaedEntryId,
			m.QaTypeEntry.created.desc()).all()
		if qa_record:
			qa_record = qa_record[0]
		else:
			log.debug('entry {} not qaed'.format(entry.entryId))
			continue
		rec = work_plan.get(qa_record.subTaskId, None)
		if not rec:
			log.debug('no auto-population configure for qa sub task {}'.format(
				qa_record.subTaskId))
			continue
		if entry.subTaskId not in rec:
			log.debug('sub task not configured for auto-population: {}'.format(
				entry.subTaskId))
			continue
		cfg = rec[entry.subTaskId]
		# Passing entries need no rework.
		if qa_record.qaScore >= cfg['threshold']:
			log.debug('qa score passed')
			continue
		reworkSubTask = cfg['dest']
		if entry.rawPieceId in currently_batched.setdefault(
				reworkSubTask.subTaskId, set()):
			log.debug('already batched in sub task {}'.format(
				reworkSubTask.subTaskId))
			continue
		group_plan.setdefault(cfg['dest'], {}).setdefault(
			entry.userId, []).append(entry)
	#print 'group_plan', group_plan
	# One single-page batch per (rework sub task, original worker).
	for reworkSubTask, workEntriesByUserId in group_plan.iteritems():
		for userId, entries in workEntriesByUserId.iteritems():
			batch = m.Batch(taskId=task.taskId,
				subTaskId=reworkSubTask.subTaskId,
				name='qa_failed_of_user#%s' % userId,
				priority=5)
			p = m.Page(pageIndex=0)
			batch.pages.append(p)
			for memberIndex, qaedEntry in enumerate(entries):
				memberEntry = m.PageMemberEntry(memberIndex=memberIndex)
				memberEntry.rawPieceId = qaedEntry.rawPieceId
				p.memberEntries.append(memberEntry)
			SS.add(batch)