def get_qa_samples(self, subTask, userId, entryIds):
    population = len(entryIds)
    sampling_error = subTask.qaConfig.samplingError
    estimated_accuracy = subTask.qaConfig.defaultExpectedAccuracy
    confidence_interval = subTask.qaConfig.confidenceInterval
    samples_needed = self.get_sample_set_size(population,
        sampling_error, estimated_accuracy, confidence_interval)
    # entries for which QA has been planned
    q_planned = SS.query(m.PageMember.workEntryId
        ).filter(m.PageMember.taskId==subTask.taskId
        ).filter(m.PageMember.workType==m.WorkType.QA
        ).distinct(m.PageMember.workEntryId)
    # entries that have been QAed already
    q_qaed = SS.query(m.WorkEntry.qaedEntryId
        ).filter(m.WorkEntry.taskId==subTask.taskId
        ).distinct(m.WorkEntry.qaedEntryId)
    all_planned = set([i.workEntryId for i in q_planned.all()])
    all_qaed = set([i.qaedEntryId for i in q_qaed.all()])
    planned = all_planned & entryIds
    qaed = all_qaed & entryIds
    to_add = samples_needed - len(qaed) - len(planned)
    if to_add <= 0:
        return []
    sample_pool = list(entryIds - planned - qaed)
    random.shuffle(sample_pool)
    return sample_pool[:to_add]

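# --- Illustrative sketch (assumption, not part of the module) -------------
# get_qa_samples() relies on self.get_sample_set_size(), which is defined
# elsewhere and not shown in this file. The helper below shows one standard
# way such a sample size could be computed (normal-approximation / Cochran's
# formula with a finite population correction). The name, z-score table and
# formula here are assumptions for illustration only.
def _example_sample_set_size(population, sampling_error, estimated_accuracy,
        confidence_interval):
    import math
    # approximate z-scores for common confidence levels
    z = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}.get(confidence_interval, 1.96)
    p = estimated_accuracy
    n0 = (z ** 2) * p * (1 - p) / (sampling_error ** 2)
    if population <= 0:
        return 0
    # finite population correction
    n = n0 / (1 + (n0 - 1) / float(population))
    return int(math.ceil(n))
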
def filter_qa_error_type(task, errorTypeId):
    try:
        errorTypeId = int(errorTypeId)
    except (ValueError, TypeError):
        raise ValueError(_('invalid error type id: {}').format(errorTypeId))
    taskErrorType = m.TaskErrorType.query.get((task.taskId, errorTypeId))
    if not taskErrorType:
        return set()
    # latest QA result for each QAed entry
    inner = SS.query(
        m.WorkEntry.entryId,
        m.WorkEntry.qaedEntryId,
        m.WorkEntry.rawPieceId).distinct(m.WorkEntry.qaedEntryId).filter(
            m.WorkEntry.taskId == task.taskId).filter(
            m.WorkEntry.workType == m.WorkType.QA).order_by(
            m.WorkEntry.qaedEntryId, m.WorkEntry.created.desc())
    sub_q = inner.subquery('sub_q')
    q = SS.query(sub_q.c.rawPieceId).distinct(sub_q.c.rawPieceId).join(
        m.AppliedError, m.AppliedError.entryId == sub_q.c.entryId).filter(
        m.AppliedError.errorTypeId == errorTypeId)
    return set([r.rawPieceId for r in q.all()])

def filter_user(task, workOption, userId):
    try:
        userId = int(userId)
    except (ValueError, TypeError):
        raise ValueError(_('invalid user id: {}').format(userId))
    # TODO: check user is working on this task?
    inner = SS.query(
        m.WorkEntry.rawPieceId.label('rawPieceId'),
        m.WorkEntry.userId.label('userId')).distinct(
            m.WorkEntry.rawPieceId).filter(m.WorkEntry.taskId == task.taskId)
    if workOption == MyFilter.ANY:
        inner = inner.filter(m.WorkEntry.userId == userId)
    elif workOption == MyFilter.FIRST:
        inner = inner.order_by(m.WorkEntry.rawPieceId, m.WorkEntry.created)
    elif workOption == MyFilter.MOST_RECENT:
        inner = inner.order_by(m.WorkEntry.rawPieceId,
            m.WorkEntry.created.desc())
    elif workOption == MyFilter.MOST_RECENT_MODIFIED:
        inner = inner.filter(
            m.WorkEntry.modifiesTranscription.is_(True)).order_by(
            m.WorkEntry.rawPieceId, m.WorkEntry.created.desc())
    else:
        raise ValueError(_('invalid work option: {}').format(workOption))
    sub_q = inner.subquery('sub_q')
    q = SS.query(sub_q.c.rawPieceId).distinct(
        sub_q.c.rawPieceId).filter(sub_q.c.userId == userId)
    return set([r.rawPieceId for r in q.all()])

def filter_unused():
    return SS.query(m.TaskWorker.userId.distinct()
        ).filter(m.TaskWorker.removed==False
        ).filter(m.TaskWorker.taskId.in_(
            SS.query(m.Task.taskId
            ).filter_by(taskType=m.TaskType.TRANSLATION
            ).filter(m.Task.status.in_([
                m.Task.STATUS_ACTIVE, m.Task.STATUS_DISABLED])
            )
        )).all()

def filter_qa_severity(task, isMoreThan, score, isCorrect):
    try:
        assert isMoreThan in (MyFilter.TRUE, MyFilter.FALSE)
    except AssertionError:
        raise ValueError(_('invalid option value: {}').format(isMoreThan))
    else:
        isMoreThan = isMoreThan == MyFilter.TRUE
    try:
        assert isCorrect in (MyFilter.TRUE, MyFilter.FALSE)
    except AssertionError:
        raise ValueError(_('invalid option value: {}').format(isCorrect))
    else:
        isCorrect = isCorrect == MyFilter.TRUE
    try:
        score = float(score)
    except (ValueError, TypeError):
        raise ValueError(_('invalid score value: {}').format(score))
    if isMoreThan:
        if isCorrect:
            predicate = lambda qaErrorSum: qaErrorSum is None or (
                1 - qaErrorSum) > score
        else:
            predicate = lambda qaErrorSum: qaErrorSum > score
    else:
        if isCorrect:
            predicate = lambda qaErrorSum: 1 - (qaErrorSum or 0) < score
        else:
            predicate = lambda qaErrorSum: qaErrorSum is None or qaErrorSum < score
    # latest QA result for each QAed entry
    q1 = SS.query(m.WorkEntry.entryId, m.WorkEntry.qaedEntryId,
        m.WorkEntry.rawPieceId).distinct(
            m.WorkEntry.qaedEntryId).filter(
            m.WorkEntry.taskId == task.taskId).filter(
            m.WorkEntry.workType == m.WorkType.QA).order_by(
            m.WorkEntry.qaedEntryId, m.WorkEntry.created.desc())
    sub_q = q1.subquery('sub_q')
    stmt = SS.query(m.AppliedError.entryId, func.sum(
        m.AppliedError.severity).label('qaErrorSum')).group_by(
        m.AppliedError.entryId).subquery()
    q = SS.query(sub_q.c.rawPieceId, stmt.c.qaErrorSum).join(stmt,
        stmt.c.entryId == sub_q.c.entryId)
    return set([r.rawPieceId for r in q.all() if predicate(r.qaErrorSum)])

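# Worked example of the predicates above, using made-up numbers. qaErrorSum
# is the summed severity of errors applied by the latest QA entry, so
# 1 - qaErrorSum is treated as an accuracy score. With score = 0.8:
#   isMoreThan=True,  isCorrect=True  -> keep rows where 1 - qaErrorSum > 0.8
#       (qaErrorSum 0.1 -> accuracy 0.9 -> kept; qaErrorSum 0.3 -> dropped)
#   isMoreThan=True,  isCorrect=False -> keep rows where qaErrorSum > 0.8
#   isMoreThan=False, isCorrect=True  -> keep rows where 1 - qaErrorSum < 0.8
#   isMoreThan=False, isCorrect=False -> keep rows where qaErrorSum < 0.8
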
def filter_word_count(task, wordCountOption, words):
    try:
        words = int(words)
    except (ValueError, TypeError):
        raise ValueError(_('invalid words: {}').format(words))
    def count_words(t):
        extractText = Converter.asExtract(t)
        return len(extractText.split())
    if wordCountOption == MyFilter.EQUALS:
        func_ok = lambda t: count_words(t) == words
    elif wordCountOption == MyFilter.GREATER_THAN:
        func_ok = lambda t: count_words(t) > words
    elif wordCountOption == MyFilter.LESS_THAN:
        func_ok = lambda t: count_words(t) < words
    else:
        raise ValueError(
            _('invalid word count option: {}').format(wordCountOption))
    q = SS.query(m.WorkEntry.rawPieceId,
        m.WorkEntry.result).distinct(m.WorkEntry.rawPieceId).filter(
            m.WorkEntry.taskId == task.taskId).filter(
            m.WorkEntry.modifiesTranscription).order_by(
            m.WorkEntry.rawPieceId, m.WorkEntry.created.desc())
    return set([r.rawPieceId for r in q.all() if func_ok(r.result)])

def filter_sub_task_work(task, workOption, subTaskId):
    try:
        subTaskId = int(subTaskId)
    except (ValueError, TypeError):
        raise ValueError(_('invalid sub task id: {}').format(subTaskId))
    subTask = m.SubTask.query.get(subTaskId)
    if not subTask or subTask.taskId != task.taskId:
        return set()
    inner = SS.query(m.WorkEntry.rawPieceId,
        m.WorkEntry.subTaskId).distinct(
            m.WorkEntry.rawPieceId).filter(m.WorkEntry.taskId == task.taskId)
    if workOption == MyFilter.ANY:
        inner = inner.filter(m.WorkEntry.subTaskId == subTaskId)
    elif workOption == MyFilter.FIRST:
        inner = inner.order_by(m.WorkEntry.rawPieceId, m.WorkEntry.created)
    elif workOption == MyFilter.MOST_RECENT:
        inner = inner.order_by(m.WorkEntry.rawPieceId,
            m.WorkEntry.created.desc())
    elif workOption == MyFilter.MOST_RECENT_MODIFIED:
        inner = inner.filter(
            m.WorkEntry.modifiesTranscription.is_(True)).order_by(
            m.WorkEntry.rawPieceId, m.WorkEntry.created.desc())
    else:
        raise ValueError(_('invalid work option: {}').format(workOption))
    sub_q = inner.subquery('sub_q')
    sel_stmt = select([sub_q.c.rawPieceId], distinct=True,
        from_obj=sub_q).where(sub_q.c.subTaskId == subTaskId)
    return set([r.rawPieceId for r in SS.bind.execute(sel_stmt)])

def check_get_policy(subTask, user):
    if subTask.getPolicy == m.SubTask.POLICY_NO_LIMIT:
        return None
    elif subTask.getPolicy == m.SubTask.POLICY_ONE_ONLY:
        # check if user has submitted any batch
        q = SS.query(m.WorkEntry.batchId.distinct()).filter(
            m.WorkEntry.subTaskId == subTask.subTaskId).filter(
            m.WorkEntry.userId == user.userId).filter(
            m.WorkEntry.batchId.notin_(
                SS.query(m.Batch.batchId).filter(
                    m.Batch.subTaskId == subTask.subTaskId)))
        if q.count() > 0:
            return _('user has done work on this sub task before').format()
    # return _('unknown policy \'{0}\' of sub task {1}'
    #     ).format(subTask.getPolicy, subTask.subTaskId)
    return None

def select(selection):
    # TODO: implement this
    taskId = getattr(selection, 'taskId')
    if taskId is None:
        raise ValueError(_('must specify taskId'))
    task = m.Task.query.get(taskId)
    filters = {
        True: {},   # inclusive
        False: {},  # exclusive
    }
    for f in selection.filters:
        filters[f.isInclusive].setdefault(f.filterType, []).append(f)
    # start from all raw pieces of the task
    rs = set([
        r.rawPieceId for r in SS.query(m.RawPiece.rawPieceId).filter(
            m.RawPiece.taskId == taskId)
    ])
    # inclusive filters: union within a filter type, intersect across types
    for filter_type, fs in filters[True].iteritems():
        result = reduce(operator.or_, [MyFilter.run(f, task) for f in fs])
        rs &= result
    # exclusive filters: union within a filter type, then subtract
    for filter_type, fs in filters[False].iteritems():
        result = reduce(operator.or_, [MyFilter.run(f, task) for f in fs])
        rs -= result
    rs = sorted(rs)
    if selection.limit is not None:
        limit = min(selection.limit, len(rs))
        rs = random.sample(rs, limit)
    return rs

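# Illustrative example (made-up filter types and IDs): how inclusive and
# exclusive filters combine in select() above, with plain sets standing in
# for MyFilter.run() results.
#
#   all pieces of the task         = {1, 2, 3, 4, 5, 6}
#   inclusive, type A (two filters) = {1, 2, 3} | {3, 4}        -> {1, 2, 3, 4}
#   inclusive, type B               = {2, 3, 4, 5}
#   rs = {1..6} & {1, 2, 3, 4} & {2, 3, 4, 5}                   -> {2, 3, 4}
#   exclusive, type C               = {4}
#   rs - {4}                                                    -> {2, 3}
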
def filter_allocation_context(task, text):
    cond = m.RawPiece.allocationContext == text
    q = SS.query(m.RawPiece.rawPieceId).filter(
        m.RawPiece.taskId == task.taskId).filter(cond)
    return set([r.rawPieceId for r in q.all()])

def filter_label(task, labelId):
    if labelId == MyFilter.ANY:
        labelId = None
    else:
        try:
            labelId = int(labelId)
        except (ValueError, TypeError):
            raise ValueError(_('invalid label id: {}').format(labelId))
    inner = SS.query(m.WorkEntry.rawPieceId.label('rawPieceId'),
        m.WorkEntry.entryId.label('entryId')).distinct(
            m.WorkEntry.rawPieceId).filter(
            m.WorkEntry.taskId == task.taskId).filter(
            m.WorkEntry.modifiesTranscription).order_by(
            m.WorkEntry.rawPieceId, m.WorkEntry.created.desc())
    sub_q = inner.subquery('sub_q')
    q = sub_q.join(m.AppliedLabel)
    sel_stmt = select([sub_q.c.rawPieceId], distinct=True, from_obj=q)
    if labelId is not None:
        sel_stmt = sel_stmt.where(m.AppliedLabel.labelId == labelId)
    return set([r.rawPieceId for r in SS.bind.execute(sel_stmt)])

def iter_user_work_pool(self, subTask, interval):
    # latest entry per (user, raw piece) within the interval, excluding
    # entries whose batch still exists on this sub task
    q_entries = SS.query(m.WorkEntry.entryId, m.WorkEntry.userId
        ).filter(m.WorkEntry.subTaskId==subTask.subTaskId
        ).filter(m.WorkEntry.batchId.notin_(
            SS.query(m.Batch.batchId).filter(
                m.Batch.subTaskId==subTask.subTaskId))
        ).filter(m.WorkEntry.created>=interval.startTime
        ).distinct(m.WorkEntry.userId, m.WorkEntry.rawPieceId
        ).order_by(m.WorkEntry.userId, m.WorkEntry.rawPieceId,
            m.WorkEntry.created.desc())
    if interval.endTime:
        q_entries = q_entries.filter(m.WorkEntry.created<=interval.endTime)
    pools = {}
    for entryId, userId in q_entries.all():
        pools.setdefault(userId, set()).add(entryId)
    for userId, entryIds in pools.iteritems():
        yield (userId, entryIds)

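# Usage sketch (assumption: callers look roughly like this). The per-user
# pools yielded above could feed get_qa_samples() defined earlier; whether
# the project wires them together exactly this way is not shown here.
#
#   for userId, entryIds in self.iter_user_work_pool(subTask, interval):
#       samples = self.get_qa_samples(subTask, userId, entryIds)
#       ...
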
def load_raw_piece_ids(self):
    self.id2key = {}
    self.key2id = {}
    q = SS.query(m.RawPiece.rawPieceId,
        m.RawPiece.assemblyContext).filter(
        m.RawPiece.taskId == self.taskId)
    for rawPieceId, assemblyContext in q.all():
        self.id2key[rawPieceId] = assemblyContext
        self.key2id[assemblyContext] = rawPieceId

def webservices_user_details():
    userId = int(request.values['userID'])
    test_records = SS.query(m.Test, m.Sheet).filter(
        m.Sheet.userId == userId).filter(
        m.Sheet.testId == m.Test.testId).filter(
        m.Sheet.score != None).order_by(
        m.Sheet.testId, m.Sheet.nTimes.desc()).distinct(
        m.Sheet.testId).all()
    assignments = SS.query(m.Task, m.TaskWorker.removed).filter(
        m.Task.taskId == m.TaskWorker.taskId).filter(
        m.TaskWorker.userId == userId).order_by(
        m.TaskWorker.taskId, m.TaskWorker.removed.desc()).distinct(
        m.TaskWorker.taskId).all()
    return dict(test_records=test_records, assignments=assignments)

def _get_qa_errors(qaEntryId):
    errors = []
    if qaEntryId is not None:
        for (errorTypeId, ) in SS.query(m.AppliedError.errorTypeId).filter(
                m.AppliedError.entryId == qaEntryId).all():
            t = errorLookUpTable.get(errorTypeId, None)
            if t is not None:
                errors.append(t)
    return errors

def get_sub_task_work_metrics(subTaskId):
    # TODO: modify query condition to include interval metrics
    metrics = m.SubTaskMetric.query.filter_by(subTaskId=subTaskId).all()
    metrics_i = m.SubTaskMetric.query.filter(
        m.SubTaskMetric.workIntervalId.in_(
            SS.query(m.WorkInterval.workIntervalId).filter(
                m.WorkInterval.subTaskId == subTaskId))).all()
    return jsonify({
        'metrics': m.SubTaskMetric.dump(metrics + metrics_i),
    })

def load_unpaid_events(self):
    # TODO: load words in this query
    subTaskId = self.subTask.subTaskId
    unpaid_events = m.PayableEvent.query.filter(
        m.PayableEvent.subTaskId == subTaskId).filter(
        m.PayableEvent.calculatedPaymentId == None).filter(
        m.PayableEvent.batchId.notin_(
            SS.query(m.Batch.batchId).filter(
                m.Batch.subTaskId == subTaskId))).all()
    return unpaid_events

def calculate_task_payment_record(taskId, payrollId):
    cutOffTime = SS.query(func.max(m.WorkInterval.endTime
        ).filter(m.WorkInterval.workIntervalId.in_(
            SS.query(m.CalculatedPayment.workIntervalId.distinct()
            ).filter_by(taskId=taskId
            ).filter_by(payrollId=payrollId))
        )).first()[0] or m.Payroll.query.get(payrollId).endDate
    itemCount, unitCount = SS.query(func.count(m.RawPiece.rawPieceId),
        func.sum(m.RawPiece.words)).filter(m.RawPiece.rawPieceId.in_(
            SS.query(m.WorkEntry.rawPieceId.distinct()
            ).filter_by(taskId=taskId
            ).filter(m.WorkEntry.created<=cutOffTime)
        )).first()
    unitCount = unitCount or 0
    calculatedSubtotal = SS.query(func.sum(m.CalculatedPayment.amount
        ).filter(m.CalculatedPayment.taskId==taskId
        ).filter(m.CalculatedPayment.payrollId<=payrollId)
        ).first()[0] or 0
    otherSubtotal = SS.query(func.sum(m.OtherPayment.amount
        ).filter(m.OtherPayment.taskId==taskId
        ).filter(m.OtherPayment.payrollId<=payrollId)
        ).first()[0] or 0
    return m.TaskPaymentRecord(taskId=taskId, payrollId=payrollId,
        itemCount=itemCount, unitCount=unitCount, cutOffTime=cutOffTime,
        paymentSubtotal=calculatedSubtotal+otherSubtotal)

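# Note on the pattern above: chaining .filter() onto func.max()/func.sum()
# uses SQLAlchemy's aggregate FILTER support (FunctionElement.filter,
# available since SQLAlchemy 1.0), which renders roughly as
#
#   SELECT max(end_time) FILTER (WHERE work_interval_id IN (...))
#
# and requires a database that supports FILTER on aggregates (e.g.
# PostgreSQL 9.4+). The column names in this rendered SQL are illustrative,
# not taken from the actual schema.
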
def progress_work_intervals(task=None):
    """
    For all intervals whose status is 'checking', if there are no more
    QA batches left for that interval, change the status to 'finished'.
    """
    q = m.WorkInterval.query.filter(
        m.WorkInterval.status == m.WorkInterval.STATUS_CHECKING)
    if task is not None:
        q = q.filter(m.WorkInterval.taskId == task.taskId)
    for wi in q.all():
        if SS.query(m.Batch.batchId).filter(
                m.Batch.workIntervalId == wi.workIntervalId).count() == 0:
            wi.status = m.WorkInterval.STATUS_FINISHED

def filter_work_type_batching(task, workTypeId):
    workType = m.WorkType.query.get(workTypeId)
    if not workType:
        raise ValueError(_('invalid work type id: {}').format(workTypeId))
    if workType.name in (m.WorkType.WORK, m.WorkType.REWORK):
        q = SS.query(m.PageMember.rawPieceId).distinct(
            m.PageMember.rawPieceId).filter(
            m.PageMember.taskId == task.taskId).filter(
            m.PageMember.workType == workType.name)
    elif workType.name == m.WorkType.QA:
        q = SS.query(m.WorkEntry.rawPieceId).distinct(
            m.WorkEntry.rawPieceId).filter(
            m.WorkEntry.taskId == task.taskId).filter(
            m.WorkEntry.entryId.in_(
                SS.query(m.PageMember.workEntryId).filter(
                    m.PageMember.taskId == task.taskId).filter(
                    m.PageMember.workType == m.WorkType.QA)))
    else:
        return set()
    return set([r.rawPieceId for r in q.all()])

def filter_transcribed(task, transcribedOption):
    if transcribedOption == MyFilter.TRUE:
        cond = m.RawPiece.isNew.isnot(True)
    elif transcribedOption == MyFilter.FALSE:
        cond = m.RawPiece.isNew.is_(True)
    else:
        raise ValueError(
            _('invalid value of transcribed option {}').format(
                transcribedOption))
    q = SS.query(m.RawPiece.rawPieceId).filter(
        m.RawPiece.taskId == task.taskId).filter(cond)
    return set([r.rawPieceId for r in q.all()])

def filter_pp_group(task, groupId):
    if groupId == MyFilter.ANY:
        cond = m.RawPiece.groupId.isnot(None)
    else:
        try:
            groupId = int(groupId)
        except (ValueError, TypeError):
            raise ValueError(_('invalid group id: {}').format(groupId))
        cond = m.RawPiece.groupId == groupId
    q = SS.query(m.RawPiece.rawPieceId).filter(
        m.RawPiece.taskId == task.taskId).filter(cond)
    return set([r.rawPieceId for r in q.all()])

def normalize_error_type_ids(data, key, errorTypeIds):
    task = data['task']
    valid_ids = set([
        r.errorTypeId
        for r in SS.query(m.TaskErrorType.errorTypeId).filter(
            m.TaskErrorType.taskId == task.taskId).filter(
            m.TaskErrorType.disabled.is_(False))
    ])
    try:
        input_ids = set([int(i) for i in errorTypeIds])
    except (ValueError, TypeError):
        raise ValueError(
            _('invalid errorTypeIds input: {0}').format(errorTypeIds))
    return sorted(valid_ids & input_ids)

def filter_source_tag(task, tagId):
    if tagId == MyFilter.ANY:
        tagId = None
        cond = m.RawPiece.rawText.contains('tagid=')
    else:
        try:
            tagId = int(tagId)
        except (ValueError, TypeError):
            raise ValueError(_('invalid tag id: {}').format(tagId))
        cond = m.RawPiece.rawText.contains('tagid="%s"' % tagId)
    q = SS.query(m.RawPiece.rawPieceId).filter(
        m.RawPiece.taskId == task.taskId).filter(cond)
    return set([r.rawPieceId for r in q.all()])

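# Example of the matching above: for tagId=5 the condition matches raw text
# containing the literal substring 'tagid="5"', while MyFilter.ANY matches
# any raw text containing 'tagid='.
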
def filter_date_interval(task, workOption, startDate, endDate):
    try:
        startDate = datetime.datetime.strptime(startDate, '%Y-%m-%d').date()
        startDate = datetime.datetime(startDate.year, startDate.month,
            startDate.day)
    except (ValueError, TypeError):
        raise ValueError(_('invalid start date: {}').format(startDate))
    try:
        endDate = datetime.datetime.strptime(endDate, '%Y-%m-%d').date()
        endDate = datetime.datetime(endDate.year, endDate.month, endDate.day)
    except (ValueError, TypeError):
        raise ValueError(_('invalid end date: {}').format(endDate))
    inner = SS.query(
        m.WorkEntry.rawPieceId,
        m.WorkEntry.entryId,
        m.WorkEntry.created).distinct(
            m.WorkEntry.rawPieceId).filter(m.WorkEntry.taskId == task.taskId)
    if workOption == MyFilter.ANY:
        pass
    elif workOption == MyFilter.FIRST:
        inner = inner.order_by(m.WorkEntry.rawPieceId, m.WorkEntry.created)
    elif workOption == MyFilter.MOST_RECENT:
        inner = inner.order_by(m.WorkEntry.rawPieceId,
            m.WorkEntry.created.desc())
    elif workOption == MyFilter.MOST_RECENT_MODIFIED:
        inner = inner.filter(m.WorkEntry.modifiesTranscription).order_by(
            m.WorkEntry.rawPieceId, m.WorkEntry.created.desc())
    else:
        raise ValueError(_('invalid work option: {}').format(workOption))
    sub_q = inner.subquery('sub_q')
    q = SS.query(sub_q.c.rawPieceId).filter(
        and_(sub_q.c.created >= startDate, sub_q.c.created <= endDate))
    return set([r.rawPieceId for r in q.all()])

def filter_custom_group(task, groupId):
    q = SS.query(m.CustomUtteranceGroupMember.rawPieceId).distinct(
        m.CustomUtteranceGroupMember.rawPieceId).join(
        m.CustomUtteranceGroup).filter(
        m.CustomUtteranceGroup.taskId == task.taskId)
    if groupId == MyFilter.ANY:
        pass
    else:
        try:
            groupId = int(groupId)
        except (ValueError, TypeError):
            raise ValueError(_('invalid group id: {}').format(groupId))
        q = q.filter(m.CustomUtteranceGroup.groupId == groupId)
    return set([r.rawPieceId for r in q.all()])

def webservices_available_work():
    userId = int(request.values['userID'])
    user = _get_user(userId)
    if not user or not user.isActive:
        raise InvalidUsage('user {} not found or inactive'.format(userId))
    # is_active = lambda subTask: subTask.task.status == m.Task.STATUS_ACTIVE
    # has_supervisor = lambda subTask: len([x for x in subTask.task.supervisors
    #     if x.receivesFeedback]) > 0
    # pay_rate_set = lambda subTask: bool(
    #     m.SubTaskRate.query.filter_by(subTaskId=subTask.subTaskId
    #     ).filter(m.SubTaskRate.validFrom<=func.now()
    #     ).order_by(m.SubTaskRate.validFrom.desc()
    #     ).first())
    # has_batch = lambda subTask: bool(
    #     m.Batch.query.filter_by(subTaskId=subTask.subTaskId
    #     ).filter(m.Batch.userId==None
    #     ).filter(m.Batch.onHold==False
    #     ).order_by(m.Batch.priority.desc()
    #     ).first())
    # candidates = Filterable(m.SubTask.query.filter(m.SubTask.subTaskId.in_(
    #     SS.query(m.TaskWorker.subTaskId).filter_by(userId=userId
    #     ).filter(m.TaskWorker.removed==False))).all())
    # subTasks = candidates | is_active | has_supervisor | pay_rate_set | has_batch
    candidates = m.SubTask.query.filter(
        m.SubTask.subTaskId.in_(
            SS.query(m.TaskWorker.subTaskId).filter_by(userId=userId).filter(
                m.TaskWorker.removed == False))).all()
    subTasks = []
    for subTask in candidates:
        if subTask.task.status != m.Task.STATUS_ACTIVE:
            continue
        if not [x for x in subTask.task.supervisors if x.receivesFeedback]:
            continue
        if not subTask.currentRate:
            continue
        if not m.Batch.query.filter_by(subTaskId=subTask.subTaskId).filter(
                m.Batch.userId == None).filter(m.Batch.onHold == False).filter(
                or_(m.Batch.notUserId.is_(None),
                    m.Batch.notUserId != userId)).order_by(
                m.Batch.priority.desc()).first():
            continue
        subTasks.append(subTask)
    result = map(format_available_work_entry, subTasks)
    return dict(entries=result)

def normalize_user_ids(data, key, value):
    userIds = set()
    for i in data['users'].split(','):
        try:
            userId = int(i)
        except ValueError:
            raise ValueError(_('invalid user id: {0}').format(i))
        else:
            userIds.add(userId)
    confirmed = [
        r[0] for r in
        SS.query(m.User.userId).filter(m.User.userId.in_(userIds))
    ]
    missing = userIds - set(confirmed)
    if missing:
        raise ValueError(_('user not found: {0}').format(
            ','.join([str(i) for i in missing])))
    return userIds

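# Example behaviour of normalize_user_ids() with made-up inputs:
#   data['users'] = '101,102'     -> set([101, 102]), provided both users exist
#   data['users'] = '101,102,abc' -> ValueError: invalid user id: abc
#   data['users'] = '101,999'     -> ValueError: user not found: 999
#     (assuming user 999 is not in the users table)
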
def filter_source_word_count(task, wordCountOption, words):
    try:
        words = int(words)
    except (ValueError, TypeError):
        raise ValueError(_('invalid word count: {}').format(words))
    if wordCountOption == MyFilter.EQUALS:
        cond = m.RawPiece.words == words
    elif wordCountOption == MyFilter.GREATER_THAN:
        cond = m.RawPiece.words > words
    elif wordCountOption == MyFilter.LESS_THAN:
        cond = m.RawPiece.words < words
    else:
        raise ValueError(
            _('invalid word count option: {}').format(wordCountOption))
    q = SS.query(m.RawPiece.rawPieceId).filter(
        m.RawPiece.taskId == task.taskId).filter(cond)
    return set([r.rawPieceId for r in q.all()])

def collapse_payable_events(task=None):
    # NOTE: the task argument is currently not used to narrow the query
    q_keys = SS.query(m.PayableEvent.rawPieceId, m.PayableEvent.workEntryId,
        m.PayableEvent.batchId, m.PayableEvent.pageId).group_by(
        m.PayableEvent.rawPieceId, m.PayableEvent.workEntryId,
        m.PayableEvent.batchId, m.PayableEvent.pageId).having(
        func.count('*') > 1)
    for rawPieceId, workEntryId, batchId, pageId in q_keys.all():
        events = m.PayableEvent.query.filter(
            m.PayableEvent.rawPieceId == rawPieceId).filter(
            m.PayableEvent.workEntryId == workEntryId).filter(
            m.PayableEvent.batchId == batchId).filter(
            m.PayableEvent.pageId == pageId).order_by(
            m.PayableEvent.created).all()
        while events:
            ev = events.pop(0)
            # delete the event if it is neither paid nor the latest
            if ev.calculatedPaymentId is None and events:
                SS.delete(ev)

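# Worked example (made-up data) of the loop above: for one duplicated
# (rawPieceId, workEntryId, batchId, pageId) key with events created at
# t1 < t2 < t3, where only the t2 event has been paid (calculatedPaymentId
# is set):
#   t1: unpaid, not the latest -> deleted
#   t2: paid                   -> kept
#   t3: latest                 -> kept
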