def handle(self, *args, **options): # create a connection mturk = MTurkConnection( getattr(settings, 'MTURK_AWS_KEY', settings.MEDIASYNC['AWS_KEY']), getattr(settings, 'MTURK_AWS_SECRET', settings.MEDIASYNC['AWS_SECRET']), host = 'mechanicalturk.sandbox.amazonaws.com' if options['sandbox'] else 'mechanicalturk.amazonaws.com' ) # if --delete, delete all the old ones first. if options['delete_first']: for hit in mturk.get_all_hits(): mturk.disable_hit(hit.HITId) if options['exclude']: exclude_reader = csv.DictReader(open(options['exclude'], 'r')) exclude = set() for row in exclude_reader: exclude.add(row['td_id']) # iterate over items and create them one by one cursor = connection.cursor() cursor.execute( """ select entity_id, type from matchbox_wikipediainfo, matchbox_entity where entity_id not in (select entity_id from matchbox_sunlightinfo where bio is not null) and bio != '' and bio is not null and entity_id = matchbox_entity.id %s order by entity_id limit %s; """ % ("and type = '%s'" % options['type'] if options['type'] else '', '%s'), # hack to put the interpolation string back in for PG to catch it [options['count']]) for row in cursor: if options['exclude']: if str(row[0]).replace('-', '') in exclude: continue if options['practice']: print row[0] continue try: hit = mturk.create_hit( question = FakeQuestionForm(get_hit_xml(row[0])), max_assignments = 3, annotation = row[0], title = "Wikipedia match validation", description = "We have matched a set of entities in a database to descriptions pulled from Wikipedia via an automated process. 
Confirm that the match is correct.", reward = 0.06, duration = datetime.timedelta(minutes=30), lifetime = datetime.timedelta(days=7), keywords = ['wikipedia', 'matching'], approval_delay = datetime.timedelta(days=3), qualifications = Qualifications([PercentAssignmentsApprovedRequirement("GreaterThan", 90)]) ) print hit[0].HITId except Exception as e: sys.stderr.write("Failed to create hit %s\n" % row[0]) sys.stderr.write(getattr(e, 'body', '')) sys.stderr.write('\n') except: pass
def getAllHits(self, hits):
    """Print every worker's answers for each HIT in *hits*, approving the
    assignments as they are read and disposing of each HIT afterwards."""
    conn = MTurkConnection(aws_access_key_id=self.ACCESS_ID,
                           aws_secret_access_key=self.SECRET_KEY,
                           host=self.HOST)
    for hit in hits:
        for assignment in conn.get_assignments(hit):
            print("Answers of the worker %s" % assignment.WorkerId)
            for form_answer in assignment.answers[0]:
                for key, value in form_answer.fields:
                    print("%s: %s" % (key, value))
            conn.approve_assignment(assignment.AssignmentId)
            print("--------------------")
        conn.disable_hit(hit)
def cancel_hit(hit):
    """Disable *hit* on MTurk, using the sandbox endpoint when flagged."""
    if hit.sandbox:
        endpoint = SANDBOX_HOST
    else:
        endpoint = HOST
    conn = MTurkConnection(aws_access_key_id=hit.aws_access_key,
                           aws_secret_access_key=hit.aws_secret_key,
                           host=endpoint)
    return conn.disable_hit(hit.mturkid)
def cancel_hit(hit):
    """Tear down the given HIT using its stored MTurk credentials."""
    return MTurkConnection(
        aws_access_key_id=hit.aws_access_key,
        aws_secret_access_key=hit.aws_secret_key,
        host=SANDBOX_HOST if hit.sandbox else HOST,
    ).disable_hit(hit.mturkid)
def delete_hits(self, hits_to_delete):
    """Disable each HIT id in *hits_to_delete* (best-effort) and return True."""
    print("Connecting to Turk host at")
    print(app.config['MTURK_HOST'])
    sys.stdout.flush()
    turk_conn = MTurkConnection(app.config['AWS_ACCESS_KEY_ID'],
                                app.config['AWS_SECRET_ACCESS_KEY'],
                                host=app.config['MTURK_HOST'])
    print("Deleting extra hits")
    for hit_id in hits_to_delete:
        try:
            turk_conn.disable_hit(hit_id)
        except MTurkRequestError:
            # Already gone on the server side; nothing to do.
            print("Trying to delete hit that doesn't exist")
    return True
def get_final_score(HITId):
    """Fetch all assignments for *HITId*, approve them, dispose of the HIT,
    and return a weighted average of legibility accuracy and opinion rating.

    Raises KeyError if *HITId* is not among the account's HITs, and
    ZeroDivisionError if no 'design' answers were submitted (index == 0).
    """
    mtc = MTurkConnection(aws_access_key_id=ACCESS_ID,
                          aws_secret_access_key=SECRET_KEY,
                          host=HOST)
    # Index every HIT on the account by id, then pick the requested one.
    hits = mtc.get_all_hits()
    hits_dict = dict()
    for hit in hits:
        hits_dict[hit.HITId] = hit
    curr_hit = hits_dict[HITId]
    sum_opin = 0   # running total of 'design' opinion ratings
    sum_acc = 0    # running total of answer_key()-graded responses
    index = 0      # number of 'design' ratings seen
    assignments = mtc.get_assignments(curr_hit.HITId)
    for assignment in assignments:
        # print "Answers of the worker %s" % assignment.WorkerId
        for question_form_answer in assignment.answers[0]:
            for key in question_form_answer.fields:
                if question_form_answer.qid == 'design':
                    # print "%s" % (key)
                    index = index + 1
                    sum_opin += int(key)
                else:
                    sum_acc += answer_key(key)
        # Pay the worker once their answers are tallied.
        mtc.approve_assignment(assignment.AssignmentId)
        # print "--------------------"
    mtc.disable_hit(curr_hit.HITId)
    # print "Average Score %s" % (sum_opin/index)
    # print "Legible Accuracy: %s%%" % (sum_acc/index)
    avg_ratings = float(sum_opin) / float(index)
    # NOTE(review): avg_ratings_score is computed but never used below --
    # the weighted sum uses raw avg_ratings, so the two terms appear to be
    # on different scales.  Confirm whether avg_ratings_score was intended.
    avg_ratings_score = avg_ratings * 25
    avg_legib_score = float(sum_acc) / float(index)
    # Weighted average: 70% legibility accuracy, 30% opinion rating.
    # (An earlier comment claimed a 60/40 split; the code says 70/30.)
    weighted_avg = 0.70 * avg_legib_score + 0.30 * avg_ratings
    return weighted_avg
class MTurkClient:
    """Thin convenience wrapper around boto's MTurkConnection."""

    # SETUP
    # ===========
    def __init__(self, aws_access_key, aws_secret_key, aws_mode):
        """aws_mode is 'sandbox' for the sandbox endpoint; anything else
        selects production."""
        self.mode = aws_mode
        if aws_mode == 'sandbox':
            self.host = 'mechanicalturk.sandbox.amazonaws.com'
        else:
            self.host = 'mechanicalturk.amazonaws.com'
        self.c = MTurkConnection(aws_access_key, aws_secret_key, host=self.host)

    # Defaults merged into every create_hit() call; DAY/MINUTE are
    # module-level duration constants.
    default_settings = {
        'lifetime': DAY,
        'duration': 10 * MINUTE,
        'approval_delay': DAY,
        'title': "[title]",
        'description': "[description]",
        'keywords': [],
        'reward': 0.01,
        'max_assignments': 1,
        'height': 700,
        'qualifications': [],
    }

    # HITS
    # ===========
    def create_hit(self, url, extra_settings):
        """Create an ExternalQuestion HIT pointing at *url*.

        Eventually, this should take a TEMPLATE and a dictionary of INPUT
        data that's put into that template.  This function would then create
        an HTML file locally (assuming we're running on a web server) by
        replacing template {tags} with input values, and then send the URL
        to the newly created page to MTurk.

        Returns (HITId, HITTypeId).
        """
        # Local renamed from ``settings`` to avoid shadowing the Django
        # settings module used elsewhere in this file.
        hit_settings = self.default_settings.copy()
        hit_settings.update(extra_settings)
        hit_settings['reward'] = Price(hit_settings['reward'])
        hit_settings['qualifications'] = qualification.Qualifications(hit_settings['qualifications'])
        hit_settings['keywords'] = ','.join(hit_settings['keywords'])
        # 'height' is the iframe height, not a create_hit kwarg.
        height = hit_settings.pop('height')
        hit = self.c.create_hit(question=ExternalQuestion(url, height), **hit_settings)[0]
        return hit.HITId, hit.HITTypeId

    def get_hit(self, hit_id):
        return self.c.get_hit(hit_id)[0]

    def hit_results(self, hit_id, type=None):
        """Return {assignment_id: {field: response, ...}} for *hit_id*.

        *type* filters by assignment status: 'Submitted', 'Approved',
        'Rejected' or None (no filter).  Bug fix: the original always
        passed status=None, silently ignoring the *type* argument.
        """
        results = {}
        assignments = self.c.get_assignments(hit_id, status=type, page_size=100)
        for asst in assignments:
            results.setdefault(asst.AssignmentId, {})
            answers = asst.answers[0]
            for qfa in answers:
                field, response = qfa.qid, qfa.fields[0]
                results[asst.AssignmentId][field] = response
            results[asst.AssignmentId]['worker_id'] = asst.WorkerId
            results[asst.AssignmentId]['accept_time'] = datetime.strptime(asst.AcceptTime, "%Y-%m-%dT%H:%M:%SZ")
            results[asst.AssignmentId]['submit_time'] = datetime.strptime(asst.SubmitTime, "%Y-%m-%dT%H:%M:%SZ")
        return results

    # URL of a HIT on MTurk -- not implemented yet.
    def hit_url_turk(self, hit_id):
        pass

    def hit_url_external(self, hit_id):
        pass

    def extend_hit(self, hit_id, extras):
        return self.c.extend_hit(hit_id, extras)

    @catcherror
    def delete_hit(self, hit_id):
        self.c.disable_hit(hit_id)

    # Deletes all the HITs on the server. Risky!
    def cleanup(self):
        for hit in self.c.get_all_hits():
            self.delete_hit(hit.HITId)

    # ASSIGNMENTS
    # ===========
    @catcherror
    def approve(self, asst_id, feedback=None):
        return self.c.approve_assignment(asst_id, feedback)

    @catcherror
    def reject(self, asst_id, feedback=None):
        return self.c.reject_assignment(asst_id, feedback)

    def block(self, worker_id, feedback=None):
        return self.c.block_worker(worker_id, feedback)

    def unblock(self, worker_id, feedback=None):
        return self.c.unblock_worker(worker_id, feedback)

    def bonus(self, asst, amount, feedback):
        return self.c.grant_bonus(asst.worker, asst.asst_id, Price(amount), feedback)

    # STATUS / DIAGNOSTICS
    # --------------------
    def balance(self):
        return self.c.get_account_balance()[0]
# --- Stage 1 results -> stage 2 HITs -------------------------------------
# Wait for the first-stage HITs to complete, then gather every distinct
# worker answer per (sentence, context) pair.
rev_hits = waitUntilHIT1Complete(mtc,hitIds)
# NOTE(review): ``Set`` is presumably the deprecated sets.Set type
# (``from sets import Set``); the builtin set would do -- confirm import.
possibleAns = defaultdict(Set)
for hit in rev_hits:
    if hit.HITId in hitIds:
        assignments = mtc.get_assignments(hit.HITId)
        for assignment in assignments:
            #print("Answers of the worker %s" % assignment.WorkerId)
            for question_form_answer in assignment.answers[0]:
                for value in question_form_answer.fields:
                    #print("%s: %s" % (hitsDic[hit.HITId],value))
                    # Key by the (sentence, context) pair recorded for the HIT.
                    possibleAns[hitsDic[hit.HITId]].add(value)
            #print("--------------------")
            # Pay the worker; the finished HIT is retired below.
            mtc.approve_assignment(assignment.AssignmentId)
        mtc.disable_hit(hit.HITId)
print('Creating the second stage HITS')
# Second stage: one HIT per (sentence, context) pair, seeded with the
# candidate answers collected above.
hitIds = Set()
answersDic = {}
for key, val in possibleAns.iteritems():
    sentence, context = key
    hitId, answers = createHIT2(val,sentence,context)
    hitIds.add(hitId)
    hitsDic[hitId] = (sentence, context)
    answersDic[sentence] = answers
rev_hits = waitUntilHIT1Complete(mtc,hitIds)
def main(argv): if (len(argv) < 2): print "Usage: tweetbeats.py <song_title> <instrument_number> <optional_topic>" else: user_topic = "" # check for command line argument if len(argv) > 2: user_topic = argv[2] ''' ' Gather Tweets ''' print "Gathering Tweets..." tc = TweetCollector() results = tc.CollectTweets(user_topic) print "Topic: " + results[0] ''' ' Create Hits ''' print "Creating HITs..." mtur = MTurk(ACCESS_ID, SECRET_KEY,HOST) for result in results[1]: res = filter(lambda x: x in string.printable, result) new_id = mtur.createHit(res) mtc = MTurkConnection(aws_access_key_id=ACCESS_ID, aws_secret_access_key=SECRET_KEY, host=HOST) hits = get_all_reviewable_hits(mtc) while (len(hits) < MIN_TWEETS): print "Not enough hits. Will try again in 10 seconds...." sleep(10) hits = get_all_reviewable_hits(mtc) hits3 = [] for hit in hits: assignments = mtc.get_assignments(hit.HITId) for assignment in assignments: print "Answers of the worker %s" % assignment.WorkerId answers = [] for question_form_answer in assignment.answers[0]: for value in question_form_answer.fields: answers.append(int(value)) print "Responses : ", answers hits3.append(answers) mtc.approve_assignment(assignment.AssignmentId) print "--------------------" mtc.disable_hit(hit.HITId) #Remove unused HITS; make 5 passes to clean up as best we can print "Removing unused HITs... Pass #1 of 5" hits = mtc.get_all_hits() for hit in hits: mtc.disable_hit(hit.HITId) print "Removing unused HITs... Pass #2 of 5" sleep(20) hits = mtc.get_all_hits() for hit in hits: mtc.disable_hit(hit.HITId) print "Removing unused HITs... Pass #3 of 5" sleep(20) hits = mtc.get_all_hits() for hit in hits: mtc.disable_hit(hit.HITId) print "Removing unused HITs... Pass #4 of 5" sleep(20) hits = mtc.get_all_hits() for hit in hits: mtc.disable_hit(hit.HITId) print "Removing unused HITs... 
Pass #5 of 5" sleep(20) hits = mtc.get_all_hits() for hit in hits: mtc.disable_hit(hit.HITId) ''' ' Make Hits into Music ''' initializeTrack(argv[1]) time = 1 for result in hits3: duration = 0 durationResult = result[1] if durationResult == 1: duration = .375 #dotted sixteenth elif durationResult == 2: duration = .5 #eighth elif durationResult == 3: duration = .75 #dotted eigth elif durationResult == 4: duration = 1 #quarter elif durationResult == 5: duration = 1.5 #dotted quarter elif durationResult == 6: duration = 2 #half elif durationResult == 7: duration = 3 #dotted half elif durationResult == 8: duration = 4 #whole shift = random.choice([-11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) chord = result[0] if chord == 1: addChord(time, duration, 100, 60 + shift, 64 + shift, 67 + shift, -1) #C maj Joy elif chord == 2: addChord(time, duration, 100, 60 + shift, 63 + shift, 67 + shift, 70 + shift) #C min9 Sadness elif chord == 3: addChord(time, duration, 100, 60 + shift, 64 + shift, 66 + shift, 69 + shift) #C dim7 Anger elif chord == 4: addChord(time, duration, 100, 60 + shift, 64 + shift, 66 + shift, -1) #C flat5 Fear elif chord == 5: addChord(time, duration, 100, 60 + shift, 64 + shift, 67 + shift, 69 + shift) #C maj6 Trust elif chord == 6: addChord(time, duration, 100, 60 + shift, 63 + shift, 67 + shift, 69 + shift) #C m6 Distrust elif chord == 7: addChord(time, duration, 100, 60 + shift, 63 + shift, 66 + shift, 70 + shift) #C m7b5 Surprise elif chord == 8: addChord(time, duration, 100, 60 + shift, 64 + shift, 67 + shift, 71 + shift) #C maj7 Anticipation time += duration addChord(time, 4, 000, 60, 60, 60, 60) #silence to allow last note to fade out closeTrack(argv[0]) music_file = argv[0] + ".mid" # set up the mixer freq = 44100 # audio CD quality bitsize = -16 # unsigned 16 bit channels = 2 # 1 is mono, 2 is stereo buffer = 2048 # number of samples pygame.mixer.init(freq, bitsize, channels, buffer) # optional volume 0 to 1.0 
pygame.mixer.music.set_volume(1.0) pygame.mixer.music.load(music_file) print "Music file %s loaded!" % music_file clock = pygame.time.Clock() pygame.mixer.music.play() while pygame.mixer.music.get_busy(): # check if playback has finished clock.tick(30)
def disableHit(self, hit_id):
    """Permanently disable the HIT identified by *hit_id*."""
    turk = MTurkConnection(
        aws_access_key_id=self.ACCESS_ID,
        aws_secret_access_key=self.SECRET_KEY,
        host=self.HOST,
    )
    turk.disable_hit(hit_id, response_groups=None)
class MTurkProvider(object):
    """Bridge between Daemo projects/tasks and Amazon Mechanical Turk.

    Wraps a boto MTurkConnection and keeps the local MTurkHIT /
    MTurkHITType / MTurkQualification records in sync with the remote
    service, including the "Boomerang" worker-score qualifications.
    """

    # Boilerplate shown to workers on every HIT.
    description = 'This is a task authored by a requester on Daemo, a research crowdsourcing platform. ' \
                  'Mechanical Turk workers are welcome to do it'
    keywords = ['daemo']
    countries = ['US', 'CA']
    min_hits = 1000

    def __init__(self, host, aws_access_key_id, aws_secret_access_key):
        # *host* is the public URL of this Daemo deployment; it is used to
        # build task and notification URLs.
        # NOTE(review): the boto connection always uses settings.MTURK_HOST
        # and ignores the *host* argument for the endpoint -- confirm intended.
        self.host = host
        self.connection = MTurkConnection(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            host=settings.MTURK_HOST)
        # Pin the MTurk API version boto should speak.
        self.connection.APIVersion = "2014-08-15"
        if not self.host:
            raise ValueError("Please provide a host url")

    def get_connection(self):
        return self.connection

    @staticmethod
    def _mturk_system_qualifications(qualification):
        """Translate Daemo qualification items into boto requirement objects.

        Only the 'location', 'approval_rate' and 'total_tasks' attributes
        are supported; other items are skipped.
        """
        requirements = []
        for item in qualification.items.all():
            if item.expression['attribute'] not in ['location', 'approval_rate', 'total_tasks']:
                continue
            requirement = None
            if item.expression['attribute'] == 'location':
                op = OP_IN if item.expression['operator'] == 'in' else OP_NOT_IN
                # Drop empty/None locale entries before building the requirement.
                requirement = MultiLocaleRequirement(op, [val.strip() for val in item.expression['value']
                                                          if val is not None and val != ''])
            elif item.expression['attribute'] == 'approval_rate':
                op = OP_GT if item.expression['operator'] == 'gt' else OP_LT
                requirement = PercentAssignmentsApprovedRequirement(op, item.expression['value'])
            elif item.expression['attribute'] == 'total_tasks':
                op = OP_GT if item.expression['operator'] == 'gt' else OP_LT
                requirement = NumberHitsApprovedRequirement(op, item.expression['value'])
            requirements.append(requirement)
        return requirements

    def get_qualifications(self, project, boomerang_threshold, add_boomerang):
        """Build the full Qualifications set for *project*.

        Returns (Qualifications, boomerang_qual), where boomerang_qual is
        the project's Boomerang score qualification record.
        """
        requirements = []
        if project.qualification is not None:
            requirements += self._mturk_system_qualifications(project.qualification)
        boomerang_qual, success = self.create_qualification_type(
            owner_id=project.owner_id,
            project_id=project.group_id,
            name='Boomerang Score #{}'.format(project.group_id),
            flag=FLAG_Q_BOOMERANG,
            description='No description available')
        boomerang = None
        # At or below the midpoint: use deny-list ("waitlist") qualifications,
        # one per bucket.  Above it: a single score >= threshold requirement.
        if boomerang_threshold <= int(settings.BOOMERANG_MIDPOINT * 100):
            for i, bucket in enumerate(WAIT_LIST_BUCKETS):
                if int(bucket[1] * 100) <= boomerang_threshold:
                    boomerang_blacklist, success = \
                        self.create_qualification_type(owner_id=project.owner_id,
                                                       name='Boomerang Waitlist #{}-{}'.format(
                                                           project.group_id,
                                                           len(WAIT_LIST_BUCKETS) - i),
                                                       flag=FLAG_Q_BOOMERANG,
                                                       description='No description available',
                                                       deny=True,
                                                       project_id=project.group_id,
                                                       bucket=bucket)
                    if success and add_boomerang:
                        # Worker must NOT hold the waitlist qualification.
                        boomerang = BoomerangRequirement(
                            qualification_type_id=boomerang_blacklist.type_id,
                            comparator=OP_DNE,
                            integer_value=None)
                        requirements.append(boomerang)
        else:
            boomerang = BoomerangRequirement(
                qualification_type_id=boomerang_qual.type_id,
                comparator=OP_GTEQ,
                integer_value=boomerang_threshold)
            if success and add_boomerang:
                requirements.append(boomerang)
        return Qualifications(requirements), boomerang_qual

    def create_hits(self, project, tasks=None, repetition=None):
        """Create (or update) MTurk HITs for every task of *project* that
        still needs assignments.

        Returns 'SUCCESS', 'FAILURE' (HIT type creation failed) or
        'FAILED' (insufficient MTurk funds).
        """
        # if project.min_rating > 0:
        #     return 'NOOP'
        if not tasks:
            cursor = connection.cursor()
            # Latest task revisions whose live assignment count is still
            # below the project's repetition.
            # noinspection SqlResolve
            query = '''
                SELECT max(id) id, repetition, group_id,
                    repetition - sum(existing_assignments) remaining_assignments, min_rating
                FROM (
                    SELECT t_rev.id, t.group_id, t.min_rating, p.repetition,
                        CASE WHEN ma.id IS NULL
                            OR ma.status IN (%(skipped)s, %(rejected)s, %(expired)s)
                            THEN 0 ELSE 1 END existing_assignments
                    FROM crowdsourcing_task t
                    INNER JOIN crowdsourcing_project p ON t.project_id = p.id
                    INNER JOIN crowdsourcing_task t_rev ON t_rev.group_id = t.group_id
                    LEFT OUTER JOIN mturk_mturkhit mh ON mh.task_id = t_rev.id
                    LEFT OUTER JOIN mturk_mturkassignment ma ON ma.hit_id = mh.id
                    WHERE t.project_id = (%(project_id)s) AND t_rev.exclude_at IS NULL
                        AND t_rev.deleted_at IS NULL
                ) t
                GROUP BY group_id, repetition, min_rating
                HAVING sum(existing_assignments) < repetition;
            '''
            cursor.execute(query, {
                'skipped': TaskWorker.STATUS_SKIPPED,
                'rejected': TaskWorker.STATUS_REJECTED,
                'expired': TaskWorker.STATUS_EXPIRED,
                'project_id': project.id
            })
            tasks = cursor.fetchall()
        # Boomerang requirements only make sense once this requester has
        # rated at least one worker.
        rated_workers = Rating.objects.filter(origin_type=Rating.RATING_REQUESTER).count()
        add_boomerang = rated_workers > 0
        duration = project.timeout if project.timeout is not None else datetime.timedelta(hours=24)
        lifetime = project.deadline - timezone.now() if project.deadline is not None \
            else datetime.timedelta(days=7)
        for task in tasks:
            # task row: (id, repetition, group_id, remaining_assignments, min_rating)
            question = self.create_external_question(task[0])
            mturk_hit = MTurkHIT.objects.filter(task_id=task[0]).first()
            qualifications, boomerang_qual = self.get_qualifications(
                project=project,
                boomerang_threshold=int(round(task[4], 2) * 100),
                add_boomerang=add_boomerang)
            qualifications_mask = 0
            if qualifications is not None:
                qualifications_mask = FLAG_Q_LOCALE + FLAG_Q_HITS + FLAG_Q_RATE + FLAG_Q_BOOMERANG
            hit_type, success = self.create_hit_type(
                title=project.name,
                description=self.description,
                price=project.price,
                duration=duration,
                keywords=self.keywords,
                approval_delay=datetime.timedelta(days=2),
                qual_req=qualifications,
                qualifications_mask=qualifications_mask,
                boomerang_threshold=int(round(task[4], 2) * 100),
                owner_id=project.owner_id,
                boomerang_qual=boomerang_qual)
            if not success:
                return 'FAILURE'
            if mturk_hit is None:
                try:
                    hit = self.connection.create_hit(
                        hit_type=hit_type.string_id,
                        max_assignments=task[3],
                        lifetime=lifetime,
                        question=question)[0]
                    self.set_notification(hit_type_id=hit.HITTypeId)
                    mturk_hit = MTurkHIT(hit_id=hit.HITId, hit_type=hit_type, task_id=task[0])
                except MTurkRequestError as e:
                    error = e.errors[0][0]
                    if error == 'AWS.MechanicalTurk.InsufficientFunds':
                        # Tell the requester over the websocket channel.
                        message = {
                            "type": "ERROR",
                            "detail": "Insufficient funds on your Mechanical Turk account!",
                            "code": error
                        }
                        redis_publisher = RedisPublisher(facility='bot', users=[project.owner])
                        message = RedisMessage(json.dumps(message))
                        redis_publisher.publish_message(message)
                        return 'FAILED'
            else:
                # Existing HIT: migrate it when the HIT type changed.
                if mturk_hit.hit_type_id != hit_type.id:
                    result, success = self.change_hit_type_of_hit(
                        hit_id=mturk_hit.hit_id,
                        hit_type_id=hit_type.string_id)
                    if success:
                        mturk_hit.hit_type = hit_type
            # NOTE(review): if create_hit failed with a non-funds error,
            # mturk_hit may still be None here -- confirm upstream handling.
            mturk_hit.save()
        return 'SUCCESS'

    def create_hit_type(self, owner_id, title, description, price, duration,
                        boomerang_threshold, keywords=None, approval_delay=None,
                        qual_req=None, qualifications_mask=0, boomerang_qual=None):
        """Reuse a cached HIT type or register a new one on MTurk.

        Returns (hit_type, success); (None, False) on an MTurk error.
        """
        # Reuse a locally cached HIT type when every attribute matches.
        hit_type = MTurkHITType.objects.filter(
            owner_id=owner_id, name=title, description=description,
            price=Decimal(str(price)), duration=duration,
            qualifications_mask=qualifications_mask,
            boomerang_threshold=boomerang_threshold).first()
        if hit_type is not None:
            return hit_type, True
        reward = Price(price)
        try:
            mturk_ht = self.connection.register_hit_type(
                title=title, description=description, reward=reward,
                duration=duration, keywords=keywords,
                approval_delay=approval_delay, qual_req=qual_req)[0]
            hit_type = MTurkHITType(owner_id=owner_id, name=title,
                                    description=description,
                                    price=Decimal(str(price)),
                                    keywords=keywords, duration=duration,
                                    qualifications_mask=qualifications_mask,
                                    boomerang_qualification=boomerang_qual,
                                    boomerang_threshold=boomerang_threshold)
            hit_type.string_id = mturk_ht.HITTypeId
            hit_type.save()
        except MTurkRequestError:
            return None, False
        return hit_type, True

    def create_external_question(self, task, frame_height=800):
        """Wrap the Daemo task page for *task* (an int id) in an
        ExternalQuestion; the id is obfuscated with Hashids."""
        task_hash = Hashids(salt=settings.SECRET_KEY, min_length=settings.ID_HASH_MIN_LENGTH)
        task_id = task_hash.encode(task)
        url = self.host + '/mturk/task/?taskId=' + task_id
        question = ExternalQuestion(external_url=url, frame_height=frame_height)
        return question

    def update_max_assignments(self, task):
        """Grow, expire or revive the HIT backing *task* (dict with 'id'),
        based on how many assignments are still needed.  Returns 'SUCCESS'."""
        task = Task.objects.get(id=task['id'])
        mturk_hit = task.mturk_hit
        if not mturk_hit:
            raise MTurkHIT.DoesNotExist(
                "This task is not associated to any mturk hit")
        # Completed = everything not rejected/skipped/expired.
        assignments_completed = task.task_workers.filter(~Q(status__in=[
            TaskWorker.STATUS_REJECTED,
            TaskWorker.STATUS_SKIPPED,
            TaskWorker.STATUS_EXPIRED
        ])).count()
        remaining_assignments = task.project.repetition - assignments_completed
        # All current slots submitted and none in progress: add one more slot.
        if remaining_assignments > 0 and mturk_hit.num_assignments == mturk_hit.mturk_assignments. \
                filter(status=TaskWorker.STATUS_SUBMITTED).count() and \
                mturk_hit.mturk_assignments.filter(status=TaskWorker.STATUS_IN_PROGRESS).count() == 0:
            self.add_assignments(hit_id=mturk_hit.hit_id, increment=1)
            self.extend_hit(hit_id=mturk_hit.hit_id)
            mturk_hit.status = MTurkHIT.STATUS_IN_PROGRESS
            mturk_hit.num_assignments += 1
            mturk_hit.save()
        elif remaining_assignments == 0:
            self.expire_hit(hit_id=mturk_hit.hit_id)
            mturk_hit.status = MTurkHIT.STATUS_EXPIRED
            mturk_hit.save()
        elif remaining_assignments > 0 and \
                mturk_hit.status == MTurkHIT.STATUS_EXPIRED:
            # NOTE(review): this branch changes status without calling
            # save() -- confirm whether that is intentional.
            self.extend_hit(hit_id=mturk_hit.hit_id)
            mturk_hit.status = MTurkHIT.STATUS_IN_PROGRESS
        return 'SUCCESS'

    def get_assignment(self, assignment_id):
        """Return (assignment, True) on success, (assignment_id, False)
        when the assignment is in an invalid state, else (None, False)."""
        try:
            return self.connection.get_assignment(assignment_id)[0], True
        except MTurkRequestError as e:
            error = e.errors[0][0]
            if error == 'AWS.MechanicalTurk.InvalidAssignmentState':
                return assignment_id, False
            return None, False

    def set_notification(self, hit_type_id):
        """Subscribe this deployment's REST endpoint to assignment events
        for the given HIT type."""
        self.connection.set_rest_notification(
            hit_type=hit_type_id,
            url=self.host + '/api/mturk/notification',
            event_types=['AssignmentReturned', 'AssignmentAbandoned',
                         'AssignmentAccepted', 'AssignmentSubmitted'])

    def approve_assignment(self, task_worker):
        """Approve the MTurk assignment backing *task_worker* (dict with
        'id').  Returns False only on an MTurk error."""
        task_worker_obj = TaskWorker.objects.get(id=task_worker['id'])
        if hasattr(task_worker_obj, 'mturk_assignments'
                   ) and task_worker_obj.mturk_assignments.first() is not None:
            try:
                self.connection.approve_assignment(
                    task_worker_obj.mturk_assignments.first().assignment_id)
            except MTurkRequestError:
                return False
        return True

    def reject_assignment(self, task_worker):
        """Reject the MTurk assignment backing *task_worker* (dict with
        'id').  Returns False only on an MTurk error."""
        task_worker_obj = TaskWorker.objects.get(id=task_worker['id'])
        if hasattr(task_worker_obj, 'mturk_assignments'
                   ) and task_worker_obj.mturk_assignments.first() is not None:
            try:
                self.connection.reject_assignment(
                    task_worker_obj.mturk_assignments.first().assignment_id)
            except MTurkRequestError:
                return False
        return True

    def expire_hit(self, hit_id):
        # Returns False on an MTurk error, True otherwise.
        try:
            self.connection.expire_hit(hit_id)
        except MTurkRequestError:
            return False
        return True

    def disable_hit(self, hit_id):
        # Returns False on an MTurk error, True otherwise.
        try:
            self.connection.disable_hit(hit_id)
        except MTurkRequestError:
            return False
        return True

    def extend_hit(self, hit_id):
        # Push the HIT's expiration out by 7 days.
        try:
            self.connection.extend_hit(hit_id=hit_id,
                                       expiration_increment=604800)  # 7 days
        except MTurkRequestError:
            return False
        return True

    def add_assignments(self, hit_id, increment=1):
        # Add *increment* assignment slots to the HIT.
        try:
            self.connection.extend_hit(hit_id=hit_id,
                                       assignments_increment=increment)
        except MTurkRequestError:
            return False
        return True

    def test_connection(self):
        """Return (balance, True) when the credentials work, else
        (None, False)."""
        try:
            return self.connection.get_account_balance()[0], True
        except MTurkRequestError as e:
            error = e.errors[0][0]
            # Both branches currently return the same value; kept for the
            # explicit not-authorized case.
            if error == 'AWS.NotAuthorized':
                return None, False
            return None, False

    def get_account_balance(self):
        try:
            return self.connection.get_account_balance()[0]
        except MTurkRequestError:
            return None

    def create_qualification_type(self, owner_id, name, flag, description,
                                  project_id, auto_granted=False,
                                  auto_granted_value=None, deny=False,
                                  bucket=None):
        """Create (or fetch) a Boomerang qualification type and sync worker
        scores onto it.

        With deny=True and a (low, high) *bucket*, only workers whose
        rating falls inside the bucket are selected (waitlist/deny-list).
        Returns (MTurkQualification, success); (None, False) on MTurk error.
        """
        # Exponentially decayed per-task worker ratings for this requester,
        # newest ratings weighted highest.
        # noinspection SqlResolve
        query = '''
            SELECT * FROM (
                SELECT task.target_id, task.username, round(task.task_w_avg::NUMERIC, 2) rating
                --round(coalesce(task.task_w_avg, requester.requester_w_avg,
                --    platform.platform_w_avg)::NUMERIC, 2) rating
                FROM (
                    SELECT target_id, origin_id, project_id, username,
                        sum(weight * power((%(BOOMERANG_TASK_ALPHA)s), t.row_number))
                            / sum(power((%(BOOMERANG_TASK_ALPHA)s), t.row_number)) task_w_avg
                    FROM (
                        SELECT r.id, r.origin_id, p.group_id project_id, weight, r.target_id,
                            -1 + row_number() OVER (PARTITION BY target_id
                                ORDER BY tw.created_at DESC) AS row_number,
                            u.username username
                        FROM crowdsourcing_rating r
                        INNER JOIN crowdsourcing_task t ON t.id = r.task_id
                        INNER JOIN crowdsourcing_project p ON p.id = t.project_id
                        INNER JOIN crowdsourcing_taskworker tw ON t.id = tw.task_id
                            AND tw.worker_id=r.target_id
                        INNER JOIN auth_user u ON u.id = r.target_id
                        WHERE origin_id = (%(origin_id)s) AND origin_type = (%(origin_type)s)) t
                    GROUP BY origin_id, target_id, project_id, username) task
                WHERE task.project_id = (%(project_id)s)
            ) r
        '''
        extra_query = 'WHERE rating BETWEEN (%(lower_bound)s) AND (%(upper_bound)s);'
        params = {
            'origin_type': Rating.RATING_REQUESTER,
            'origin_id': owner_id,
            'project_id': project_id,
            'BOOMERANG_REQUESTER_ALPHA': settings.BOOMERANG_REQUESTER_ALPHA,
            'BOOMERANG_PLATFORM_ALPHA': settings.BOOMERANG_PLATFORM_ALPHA,
            'BOOMERANG_TASK_ALPHA': settings.BOOMERANG_TASK_ALPHA
        }
        # Score bounds stored on the local qualification record (x100 scale).
        obj_params = {'upper_bound': 300, 'lower_bound': 100}
        if deny and bucket is not None:
            # Waitlist mode: restrict to workers inside the bucket.
            query += extra_query
            params.update({'upper_bound': bucket[1], 'lower_bound': bucket[0]})
            obj_params.update({
                'upper_bound': bucket[1] * 100,
                'lower_bound': bucket[0] * 100,
                'is_blacklist': True
            })
        cursor = connection.cursor()
        cursor.execute(query, params=params)
        worker_ratings_raw = cursor.fetchall()
        worker_ratings = [{
            "worker_id": r[0],
            "worker_username": r[1],
            "rating": r[2]
        } for r in worker_ratings_raw]
        qualification = MTurkQualification.objects.filter(owner_id=owner_id,
                                                          flag=flag,
                                                          name=name).first()
        assigned_workers = []
        if qualification is None:
            # First time: create the type on MTurk and mirror it locally.
            try:
                qualification_type = self.connection. \
                    create_qualification_type(name=name,
                                              description=description,
                                              status='Active',
                                              auto_granted=auto_granted,
                                              auto_granted_value=auto_granted_value)[0]
                qualification = MTurkQualification.objects.create(
                    owner_id=owner_id,
                    flag=flag,
                    name=name,
                    description=description,
                    auto_granted=auto_granted,
                    auto_granted_value=auto_granted_value,
                    type_id=qualification_type.QualificationTypeId,
                    **obj_params)
            except MTurkRequestError:
                return None, False
        else:
            assigned_workers = MTurkWorkerQualification.objects.values(
                'worker').filter(qualification=qualification).values_list(
                'worker', flat=True)
        # Push scores for workers whose Daemo username encodes an MTurk
        # worker id ("mturk.<WORKERID>") and who are not yet assigned.
        for rating in worker_ratings:
            user_name = rating["worker_username"].split('.')
            if len(user_name) == 2 and user_name[0] == 'mturk':
                mturk_worker_id = user_name[1].upper()
                if mturk_worker_id not in assigned_workers:
                    self.assign_qualification(
                        qualification_type_id=qualification.type_id,
                        worker_id=mturk_worker_id,
                        value=int(rating['rating'] * 100))
                    defaults = {
                        'qualification': qualification,
                        'worker': mturk_worker_id,
                        'score': int(rating['rating'] * 100)
                    }
                    MTurkWorkerQualification.objects.update_or_create(
                        qualification=qualification,
                        worker=mturk_worker_id,
                        defaults=defaults)
        return qualification, True

    def change_hit_type_of_hit(self, hit_id, hit_type_id):
        """Move an existing HIT to a different HIT type.
        Returns (result, success)."""
        try:
            result = self.connection.change_hit_type_of_hit(
                hit_id=hit_id, hit_type=hit_type_id)
        except MTurkRequestError:
            return None, False
        return result, True

    def update_worker_boomerang(self, project_id, worker_id, task_avg, requester_avg):
        """ Update boomerang for project
        Args:
            project_id:
            worker_id:
            task_avg:
            requester_avg

        Returns:
            str
        """
        hit = MTurkHIT.objects.select_related(
            'hit_type__boomerang_qualification').filter(
            task__project__group_id=project_id).first()
        if hit is not None:
            qualification = hit.hit_type.boomerang_qualification
            worker_qual = MTurkWorkerQualification.objects.filter(
                qualification=qualification, worker=worker_id).first()
            if worker_qual is not None:
                # Overwrite the existing score (x100 scale).
                self.update_score(worker_qual, score=int(task_avg * 100),
                                  override=True)
            else:
                MTurkWorkerQualification.objects.create(
                    qualification=qualification,
                    worker=worker_id,
                    score=int(task_avg * 100),
                    overwritten=True)
                self.assign_qualification(
                    qualification_type_id=qualification.type_id,
                    worker_id=worker_id,
                    value=int(task_avg * 100))
            # other_quals = MTurkWorkerQualification.objects.filter(~Q(qualification=qualification),
            #                                                       worker=worker_id,
            #                                                       overwritten=False)
            # for q in other_quals:
            #     self.update_score(q, score=int(requester_avg * 100))
        return 'SUCCESS'

    def update_score(self, worker_qual, score, override=False):
        """Push *score* to MTurk and mirror it on the local record.
        Returns False when *worker_qual* is None or MTurk errors."""
        if worker_qual is None:
            return False
        try:
            self.connection.update_qualification_score(
                worker_qual.qualification.type_id, worker_qual.worker, score)
            worker_qual.overwritten = override
            worker_qual.score = score
            worker_qual.save()
        except MTurkRequestError:
            return False
        return True

    def assign_qualification(self, qualification_type_id, worker_id, value=1):
        """ Revoke a qualification from a WorkerId
        Args:
            qualification_type_id:
            worker_id:
            value

        Returns:
            bool
        """
        try:
            # Workers are not notified about score assignments.
            self.connection.assign_qualification(qualification_type_id,
                                                 worker_id,
                                                 value,
                                                 send_notification=False)
            return True
        except MTurkRequestError:
            return False

    def revoke_qualification(self, qualification_type_id, worker_id):
        try:
            self.connection.revoke_qualification(
                qualification_type_id=qualification_type_id,
                subject_id=worker_id)
            return True
        except MTurkRequestError:
            return False

    def notify_workers(self, worker_ids, subject, message_text):
        try:
            self.connection.notify_workers(worker_ids, subject, message_text)
            return True
        except MTurkRequestError:
            return False
class HitCreator(): def __init__(self): if settings.IS_DEV_ENV or settings.USE_AMT_SANDBOX: HOST = 'mechanicalturk.sandbox.amazonaws.com' else: HOST = 'mechanicalturk.amazonaws.com' self.connection = MTurkConnection( aws_access_key_id=settings.AWS_ACCESS_KEY_ID, aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, host=HOST) def createHitFrom(self, audioSnippet, hitType, numIncorrectWords=None): if hitType == "fix": suffix = "fixHIT" # half cent per incorrect word, up to eight words assert isinstance(numIncorrectWords, int) amount = max(min(.05, numIncorrectWords*.005), .02) elif hitType == "check": suffix = "checkHIT" amount = 0.05 else: assert False if settings.IS_DEV_ENV: baseurl = 'https://localhost:5000/hit/' + suffix else: baseurl = "https://transcroobie.herokuapp.com/hit/" + suffix title = "Transcribe a short audio clip." description = "Transcribe the audio. Words may be cut off at the beginning"\ " or end of the segment. Do not worry about correctly"\ " transcribing these words." keywords = ["transcription"] frame_height = 800 thisDocUrl = baseurl + "?docId=" + str(audioSnippet.pk) questionform = ExternalQuestion(thisDocUrl, frame_height) resultSet = self.connection.create_hit( title=title, description=description, keywords=keywords, max_assignments=1, question=questionform, reward=Price(amount=amount), response_groups=('Minimal', 'HITDetail'), # I don't know what response groups are ) assert len(resultSet) == 1 audioSnippet.activeHITId = resultSet[0].HITId audioSnippet.save() def deleteHit(self, hitID): try: self.connection.disable_hit(hitID) except MTurkRequestError as e: print "HIT already deleted", e def deleteAllHits(self): allHits = [hit for hit in self.connection.get_all_hits()] for hit in allHits: print "Disabling hit ", hit.HITId self.deleteHit(hit.HITId) def processHit(self, questionFormAnswers): # Process each HIT only once. This function will set activeHITId to "" # to let you know that the HIT is completed and processed. 
hitType = None response = None audioSnippet = None fixWords = {} for questionFormAnswer in questionFormAnswers: if questionFormAnswer.qid == "asFileId": asFileId = questionFormAnswer.fields[0] audioSnippet = get_object_or_404(AudioSnippet, pk = asFileId) elif questionFormAnswer.qid == "fixedHITResult": hitType = "fix" response = None # need to look at word_%d based on audiosnippet elif questionFormAnswer.qid.startswith("word_"): fixWords[questionFormAnswer.qid] = questionFormAnswer.fields[0] elif questionFormAnswer.qid == "checkedHITResult": hitType = "check" responseStr = questionFormAnswer.fields[0] response = [val == 'true' for val in responseStr.split(',')] numIncorrectWords = 0 if hitType == "fix": # Get the list of words marked incorrect, and count them incorrectWords = audioSnippet.incorrectWords['bools'][-1] numIncorrectWords = len(incorrectWords)-sum(incorrectWords) # Get the last prediction to interpret incorrectWords prediction = audioSnippet.predictions[-1].split() # Convert the last prediction to what was actually sent to # the user predictionSpaced = transcriptWithSpacesAndEllipses(prediction) assert len(incorrectWords) == len(predictionSpaced) words, isCorrect = combineConsecutiveDuplicates(predictionSpaced, incorrectWords) response = "" for i in xrange(len(words)): if not isCorrect[i]: response += fixWords["word_" + str(i)] + " " else: # Only add punctuation (" ") and ellipses if marked incorrect word = words[i] if word.isspace() or word == "": continue elif i == 0 and word.startswith("..."): word = word[3:] # remove initial ellipses elif i == len(words)-1 and word.endswith("..."): word = word[:-3] # remove trailing ellipses response += word.strip() + " " audioSnippet.predictions.append(response) # Always do a check after a fix completionStatus = CompletionStatus.incomplete else: audioSnippet.incorrectWords['bools'].append(response) completionStatus = self.getCompletionStatus(audioSnippet, response) if completionStatus == CompletionStatus.correct: 
audioSnippet.hasBeenValidated = True audioSnippet.isComplete = True elif completionStatus == CompletionStatus.givenup: audioSnippet.hasBeenValidated = False audioSnippet.isComplete = True audioSnippet.activeHITId = "" if completionStatus == CompletionStatus.incomplete: if hitType == "check": # CHECK task complete. Create a FIX task (since not # hasBeenValidated) self.createHitFrom(audioSnippet, 'fix', numIncorrectWords) elif hitType == "fix": # FIX task complete. Create a CHECK task. self.createHitFrom(audioSnippet, 'check') audioSnippet.save() def getCompletionStatus(self, audioSnippet, response): # only callwhen all hitTypes == "check" # returns a CompletionStatus MAX_NUM_PREDICTIONS = 2 completionStatus = CompletionStatus.incomplete if all(response): completionStatus = CompletionStatus.correct elif len(audioSnippet.predictions) > MAX_NUM_PREDICTIONS: completionStatus = CompletionStatus.givenup return completionStatus def processHits(self, doc): """ Returns whether or not the doc had a newly-completed HIT which was processed. """ assert not doc.completeTranscript audioSnippets = doc.audioSnippets.order_by('id') newHITCompleted = False assignments = [] for audioSnippet in audioSnippets: hitID = audioSnippet.activeHITId if not hitID: continue try: hit = self.connection.get_hit(hitID) except MTurkRequestError as e: logger.error("Perhaps this HIT no longer exists: " + str(e)) continue asgnForHit = self.connection.get_assignments(hit[0].HITId) if asgnForHit: # Hit is ready. Get the data. for asgn in asgnForHit: assignments.append(asgn) questionFormAnswers = asgn.answers[0] self.processHit(questionFormAnswers) newHITCompleted = True statuses = [a.isComplete for a in audioSnippets] if all([a.hasBeenValidated for s in statuses]) or \ all([a.isComplete for a in audioSnippets]): # Note: if the conditional is not met, predictions may be an empty # array. Don't run this next line outside of this conditional. 
# (Happens only in a race condition after the audioSnippet is # uploaded, and before it adds its first prediction.) responses = [a.predictions[-1] for a in audioSnippets] # All tasks complete for first time totalString = overlap.combineSeveral(responses) doc.completeTranscript = totalString doc.save() return newHITCompleted def isTaskReady(self, hitID): return len(self.connection.get_assignments(hitID)) > 0 def approveAllHits(self): # Approve hits: for assignment in self.getAllAssignments(): try: self.connection.approve_assignment(assignment.AssignmentId) except MTurkRequestError as e: # Maybe already approved? logger.error("MTurk Request Error: " + str(e)) def checkIfHitsReady(self): return True def getAllAssignments(self): allHits = [hit for hit in self.connection.get_all_hits()] # Approve hits: for hit in allHits: assignments = self.connection.get_assignments(hit.HITId) for assignment in assignments: yield assignment
class ElicitationPipelineHandler(object):
    """Interactive pipeline: prompt sources -> elicitation HITs -> recordings.

    Glues together MTurk (boto), a Mongo artifact store (self.mh), and a set
    of handler helpers. Driven from the text menu in run(). HOST and
    TEMPLATE_DIR are module-level names defined elsewhere in this file.
    """

    def __init__(self):
        # Credentials are taken from the environment; KeyError here means
        # they are not configured.
        aws_id = os.environ['AWS_ACCESS_KEY_ID']
        aws_k = os.environ['AWS_ACCESS_KEY']
        try:
            self.conn = MTurkConnection(aws_access_key_id=aws_id,
                                        aws_secret_access_key=aws_k,
                                        host=HOST)
        except Exception as e:
            # NOTE(review): a failed connection is only printed; later
            # attribute access on self.conn would then fail — confirm intent.
            print(e)
        self.ah = AssignmentHandler(self.conn)
        self.th = TurkerHandler(self.conn)
        self.hh = HitHandler(self.conn, TEMPLATE_DIR)
        self.mh = MongoElicitationHandler()
        self.ph = PromptHandler()
        self.filter = Filter(self.mh)
        self.balance = self.conn.get_account_balance()[0].amount
        # Cap spending for this session at batch_cost; refuse to start
        # when the account balance does not cover it.
        self.batch_cost = 1
        if self.balance > self.batch_cost:
            self.balance = self.batch_cost
        else:
            raise IOError
        self.logger = logging.getLogger(
            "transcription_engine.elicitation_pipeline_handler")

    def load_PromptSource_RawToList(self, prompt_file_uri):
        """Create the prompt artifacts from the source."""
        prompt_dict = self.ph.get_prompts(prompt_file_uri)
        disk_space = os.stat(prompt_file_uri).st_size
        source_id = self.mh.create_prompt_source_artifact(
            prompt_file_uri, disk_space, len(prompt_dict))
        normalizer = Normalize()
        for key in prompt_dict:
            prompt, line_number = prompt_dict[key]
            normalized_prompt = normalizer.rm_prompt_normalization(prompt)
            self.mh.create_prompt_artifact(source_id, prompt,
                                           normalized_prompt, line_number,
                                           key, len(prompt))

    def load_assignment_hit_to_submitted(self):
        """Check all assignments for audio clip IDs.
        Update the audio clips.
        This is a non-destructive load of the assignments from MTurk"""
        hits = self.conn.get_all_hits()
        for hit in hits:
            transcription_dicts = [{}]
            hit_id = hit.HITId
            # Only HITs we created (tracked in Mongo) are considered.
            if self.mh.get_artifact("elicitation_hits", {"_id": hit_id}):
                assignments = self.conn.get_assignments(hit_id)
                have_all_assignments = True
                assignment_ids = []
                for assignment in assignments:
                    assignment_id = assignment.AssignmentId
                    assignment_ids.append(assignment_id)
                    if self.mh.get_artifact(
                            "elicitation_assignments",
                            {"_id": assignment.AssignmentId}):
                        #We create assignments here, so if we already have it, skip
                        continue
                        #pass
                    else:
                        have_all_assignments = False
                    recording_ids = []
                    prompt_id_tag = "prompt_id"
                    recording_url_tag = "recording_url"
                    worker_id_tag = "worker_id"
                    recording_dict = self.ah.get_assignment_submitted_text_dict(
                        assignment, prompt_id_tag, recording_url_tag)
                    worker_oid = self.mh.create_worker_artifact(
                        assignment.WorkerId)
                    zipcode = None
                    for recording in recording_dict:
                        # The zipcode pseudo-prompt carries demographic
                        # data, not audio.
                        if recording[prompt_id_tag] == "zipcode":
                            zipcode = recording[recording_url_tag]
                            continue
                        if not self.mh.get_artifact_by_id(
                                "prompts", recording[prompt_id_tag]):
                            self.logger.info("Assignment(%s) with unknown %s(%s) skipped"%\
                                (assignment_id,prompt_id_tag,recording[prompt_id_tag]))
                            break
                        recording_id = self.mh.create_recording_source_artifact(
                            recording[prompt_id_tag],
                            recording[recording_url_tag],
                            recording[worker_id_tag])
                        if not recording_id:
                            # Recording could not be stored: record the
                            # assignment as incomplete and stop.
                            self.mh.create_assignment_artifact(
                                assignment, recording_ids, zipcode=zipcode,
                                incomplete=True)
                            break
                        self.mh.add_item_to_artifact_set(
                            "prompts", recording[prompt_id_tag],
                            "recording_sources", recording_id)
                        recording_ids.append(recording_id)
                    else:
                        # for/else: runs only when no break fired above,
                        # i.e. every recording was stored successfully.
                        self.mh.create_assignment_artifact(
                            assignment, recording_ids, zipcode=zipcode)
                        self.mh.add_item_to_artifact_set(
                            "elicitation_hits", hit_id,
                            "submitted_assignments", assignment_id)
                        self.mh.add_item_to_artifact_set(
                            "workers", worker_oid, "submitted_assignments",
                            assignment_id)
                print("Elicitation HIT(%s) submitted assignments: %s " % (hit_id,
                      assignment_ids))

    def approve_assignment_submitted_to_approved(self):
        """Approve all submitted assignments"""
        hits = self.conn.get_all_hits()
        for hit in hits:
            transcription_dicts = [{}]
            hit_id = hit.HITId
            if self.mh.get_artifact("elicitation_hits", {"_id": hit_id}):
                assignments = self.conn.get_assignments(hit_id)
                have_all_assignments = True
                assignment_ids = []
                for assignment in assignments:
                    assignment_id = assignment.AssignmentId
                    assignment_ids.append(assignment_id)
                    if self.mh.get_artifact("elicitation_assignments", {
                            "_id": assignment_id,
                            "state": "Submitted"
                    }):
                        #WARNING: this Approves every assignment
                        self.conn.approve_assignment(
                            assignment_id,
                            "Thank you for completing this assignment!")
                        self.mh.update_artifact_by_id(
                            "elicitation_assignments", assignment_id,
                            "approval_time", datetime.datetime.now())

    def approve_assignment_by_worker(self):
        """Hand-review submitted assignments, worker by worker.

        Pre-approved workers are accepted automatically, pre-rejected ones
        rejected; everything else is played back locally (gnome-mplayer)
        and decided interactively via raw_input.
        """
        approval_comment = "Thank you for your recordings, good work, assignment approved!"
        denial_comment = "I'm sorry but your work was denied because %s"
        hits = self.conn.get_all_hits()
        for hit in hits:
            transcription_dicts = [{}]
            hit_id = hit.HITId
            if self.mh.get_artifact("elicitation_hits", {"_id": hit_id}):
                assignments = self.conn.get_assignments(hit_id)
                have_all_assignments = True
                assignment_ids = []
                for assignment in assignments:
                    assignment_id = assignment.AssignmentId
                    assignment_ids.append(assignment_id)
                    if self.mh.get_artifact("elicitation_assignments", {
                            "_id": assignment_id,
                            "state": "Submitted"
                    }):
                        #WARNING: this Approves every assignment
                        assignment_artifact = self.mh.get_artifact(
                            "elicitation_assignments", {"_id": assignment_id})
                        recording_ids = assignment_artifact["recordings"]
                        worker = self.mh.get_artifact(
                            "workers", {"eid": assignment_artifact["worker_id"]})
                        if worker["state"] == "Approved":
                            #If the worker is approved, approve the assignment automatically
                            self.conn.approve_assignment(
                                assignment_id, approval_comment)
                            self.mh.update_artifact_by_id(
                                "elicitation_assignments",
                                assignment_id, "approval_time",
                                datetime.datetime.now())
                            continue
                        elif worker["state"] == "Rejected":
                            self.conn.reject_assignment(
                                assignment_id, worker["rejection_reason"])
                            self.mh.update_artifact_by_id(
                                "elicitation_assignments", assignment_id,
                                "approval_time", datetime.datetime.now())
                            continue
                        recording_uris = []
                        for recording_id in recording_ids:
                            uri = self.mh.get_artifact_by_id(
                                "recording_sources", recording_id,
                                "recording_uri")
                            recording_uris.append(uri)
                        command = ["gnome-mplayer"] + recording_uris
                        # NOTE(review): `and` binds tighter than `or`, so an
                        # empty recording_uris still evaluates the second
                        # endswith and would raise IndexError — confirm.
                        if len(recording_uris) > 0 and recording_uris[
                                0].endswith(" .wav") or recording_uris[
                                0].endswith(".com.wav"):
                            continue
                        print("Calling: %s" % command)
                        call(command)
                        approve_assignment = raw_input(
                            "Approve assignment(y/n/s)?")
                        if approve_assignment == "s":
                            #skip
                            continue
                        elif approve_assignment == "y":
                            #accept the assignment
                            self.conn.approve_assignment(
                                assignment_id, approval_comment)
                            self.mh.update_artifact_by_id(
                                "elicitation_assignments", assignment_id,
                                "approval_time", datetime.datetime.now())
                            approve_worker = raw_input("Approve worker(y/n)?")
                            if approve_worker == "y":
                                #approve the worker and all future assignments
                                self.mh.update_artifact_by_id(
                                    "workers", worker["_id"], "approval_time",
                                    datetime.datetime.now())
                        elif approve_assignment == "n":
                            #Reject the assignment
                            reject_worker = raw_input(
                                "Reject this worker's future work?")
                            if reject_worker == "y":
                                #Reject the worker
                                reason = raw_input(
                                    "Reason for rejecting this worker's future work:"
                                )
                                self.mh.update_artifact_by_id(
                                    "workers", worker["_id"],
                                    "rejection_reason", reason)
                                self.conn.reject_assignment(
                                    assignment_id,
                                    denial_comment % reason + ".")
                            else:
                                reason = raw_input(
                                    "Why reject the assignment?")
                                self.conn.reject_assignment(
                                    assignment_id,
                                    denial_comment % reason + ".")

    def get_assignment_stats(self):
        # Fixed reward of $0.20 per assignment for the wage estimate.
        effective_hourly_wage = self.effective_hourly_wage_for_approved_assignments(
            .20)

    def effective_hourly_wage_for_approved_assignments(self, reward_per_assignment):
        """Calculate the effective hourly wage
        for Approved Assignments"""
        approved_assignments = self.mh.get_artifacts_by_state(
            "elicitation_assignments", "Approved")
        total = datetime.timedelta(0)
        count = 0
        for assignment in approved_assignments:
            # AMT timestamps are ISO-8601 UTC ("...Z").
            accepted = datetime.datetime.strptime(assignment["AcceptTime"],
                                                  "%Y-%m-%dT%H:%M:%SZ")
            submitted = datetime.datetime.strptime(assignment["SubmitTime"],
                                                   "%Y-%m-%dT%H:%M:%SZ")
            total += submitted - accepted
            count += 1
            #self.mh.update_artifact_by_id("elicitation_assignments", assignment["_id"], "SubmitTime", completion_time)
        # NOTE(review): ZeroDivisionError when there are no approved
        # assignments — confirm acceptable for this maintenance tool.
        seconds_per_assignment = total.total_seconds() / count
        effective_hourly_wage = 60.0 * 60.0 / seconds_per_assignment * reward_per_assignment
        print("Effective completion time(%s) *reward(%s) = %s" %
              (seconds_per_assignment, reward_per_assignment,
               effective_hourly_wage))

    def enqueue_prompts_and_generate_hits(self):
        """Queue every New prompt and create an elicitation HIT when affordable."""
        prompts = self.mh.get_artifacts_by_state("prompts", "New")
        for prompt in prompts:
            self.mh.enqueue_prompt(prompt["_id"], 1, 5)
        prompt_queue = self.mh.get_prompt_queue()
        prompt_pairs = self.mh.get_prompt_pairs(prompt_queue)
        if prompt_pairs:
            hit_title = "Audio Elicitation"
            question_title = "Speak and Record your Voice"
            hit_description = "Speak the prompt and record your voice."
            keywords = "audio, elicitation, speech, recording"
            # NOTE(review): `cost_sensitive` is not defined in this method or
            # any visible scope — this raises NameError unless a module-level
            # name exists. The transcription pipeline's equivalent takes it
            # as a parameter; likely a missing parameter here. Confirm.
            if cost_sensitive:
                reward_per_clip = 0.04
                max_assignments = 2
                estimated_cost = self.hh.estimate_html_HIT_cost(
                    prompt_pairs, reward_per_clip=reward_per_clip,
                    max_assignments=max_assignments)
                prompts_in_hits = self.mh.prompts_already_in_hit(
                    prompt_pairs)
                if prompts_in_hits:
                    #If one or more clips are already in a HIT, remove it from the queue
                    self.mh.remove_artifact_from_queue(prompts_in_hits)
                elif self.balance - estimated_cost >= 0:
                    #if we have enough money, create the HIT
                    response = self.hh.make_html_elicitation_HIT(
                        prompt_pairs, hit_title, question_title, keywords,
                        hit_description, max_assignments=max_assignments,
                        reward_per_clip=reward_per_clip)
                    # response = self.hh.make_question_form_elicitation_HIT(prompt_pairs,hit_title,
                    #                                                       question_title, keywords)
                    self.balance = self.balance - estimated_cost
                    if type(response) == ResultSet and len(
                            response) == 1 and response[0].IsValid:
                        response = response[0]
                        self.mh.remove_artifacts_from_queue(
                            "prompt_queue", prompt_queue)
                        prompt_ids = [w["prompt_id"] for w in prompt_queue]
                        hit_id = response.HITId
                        hit_type_id = response.HITTypeId
                        self.mh.create_elicitation_hit_artifact(
                            hit_id, hit_type_id, prompt_ids)
                        self.mh.update_artifacts_by_id(
                            "prompts", prompt_ids, "hit_id", hit_id)
                        self.logger.info("Successfully created HIT: %s" % hit_id)
        else:
            return True
        print("Amount left in batch: %s out of %s" %
              (self.balance, self.batch_cost))

    def allhits_liveness(self):
        """Interactively disable HITs and recycle their prompts."""
        #allassignments = self.conn.get_assignments(hit_id)
        #first = self.ah.get_submitted_transcriptions(hit_id,str(clipid))
        hits = self.conn.get_all_hits()
        selection = raw_input("Remove all hits with no assignments?")
        if selection == "y":
            # Bulk mode: silently drop every assignment-less HIT.
            for hit in hits:
                hit_id = hit.HITId
                assignments = self.conn.get_assignments(hit_id)
                if len(assignments) == 0:
                    try:
                        self.conn.disable_hit(hit_id)
                        prompts = self.mh.get_artifact("elicitation_hits",
                                                       {"_id": hit_id},
                                                       "prompts")
                        self.mh.remove_elicitation_hit(hit_id)
                        if prompts:
                            # Return the orphaned prompts to a queueable state.
                            self.mh.update_artifacts_state("prompts", prompts)
                        else:
                            pass
                    except MTurkRequestError as e:
                        raise e
            return True
        # Per-HIT interactive mode.
        for hit in hits:
            hit_id = hit.HITId
            print("HIT ID: %s" % hit_id)
            assignments = self.conn.get_assignments(hit_id)
            if len(assignments) == 0:
                if raw_input("Remove hit with no submitted assignments?(y/n)"
                             ) == "y":
                    try:
                        self.conn.disable_hit(hit_id)
                        prompts = self.mh.get_artifact("elicitation_hits",
                                                       {"_id": hit_id},
                                                       "prompts")
                        self.mh.remove_elicitation_hit(hit_id)
                        if prompts:
                            self.mh.update_artifacts_state("prompts", prompts)
                        else:
                            pass
                    except MTurkRequestError as e:
                        raise e
            else:
                if raw_input("Remove hit with %s submitted assignments?(y/n)"
                             % len(assignments)) == "y":
                    try:
                        self.conn.disable_hit(hit_id)
                    except MTurkRequestError as e:
                        raise e

    def run(self):
        """Text-menu driver for the elicitation pipeline."""
        #audio_file_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/dep_trn"
        prompt_file_uri = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/doc/al_sents.snr"
        selection = 0
        #self.get_time_submitted_for_assignments()
        while selection != "8":
            selection = raw_input(
                """Prompt Source raw to Elicitations-Approved Pipeline:\n
                 1: PromptSource-Load_RawToList: Load Resource Management 1 prompt source files to queueable prompts
                 2: Prompt-ReferencedToHit: Queue all referenced prompts and create a HIT if the queue is full.
                 3: Prompt-HitToAssignmentSubmitted: Check all submitted assignments for Elicitations and download elicitations.
                 4: Maintain all assignments and hits.
                 5: (WARNING, approves all assignments) Approve all submitted assignments.
                 6: Calculate assignment stats.
                 7: Hand approve submitted assignments by elicitation and/or by worker.
                 8: Exit
            """)
            if selection == "1":
                self.load_PromptSource_RawToList(prompt_file_uri)
            elif selection == "2":
                self.enqueue_prompts_and_generate_hits()
            elif selection == "3":
                self.load_assignment_hit_to_submitted()
            elif selection == "4":
                self.allhits_liveness()
            elif selection == "5":
                self.approve_assignment_submitted_to_approved()
            elif selection == "6":
                self.get_assignment_stats()
            elif selection == "7":
                self.approve_assignment_by_worker()
            else:
                selection = "8"
        # prompt_dict = self.ph.get_prompts(prompt_file_uri)

    # def get_time_submitted_for_assignments(self):
    #     assignments = self.mh.get_all_artifacts("elicitation_assignments")
    #     for assignment in assignments:
    #         assignment_id = assignment["_id"]
    #         a_assignment = self.conn.get_assignment(assignment_id)[0]
    #         self.mh.update_artifact_by_id("elicitation_assignments", assignment_id, "SubmitTime", a_assignment.SubmitTime)
class TranscriptionPipelineHandler():
    """Interactive pipeline: RM audio sources -> transcription HITs -> approved transcriptions.

    Mirrors ElicitationPipelineHandler but for transcription work: audio
    clips are queued, batched into HITs, and submitted transcriptions are
    scored against reference transcriptions (WER) before approval.
    """

    def __init__(self):
        # Credentials from the environment; HOST/TEMPLATE_DIR are module-level.
        aws_id = os.environ['AWS_ACCESS_KEY_ID']
        aws_k = os.environ['AWS_ACCESS_KEY']
        self.conn = MTurkConnection(aws_access_key_id=aws_id,
                                    aws_secret_access_key=aws_k,
                                    host=HOST)
        self.ah = AssignmentHandler(self.conn)
        self.th = TurkerHandler(self.conn)
        self.hh = HitHandler(self.conn, TEMPLATE_DIR)
        self.mh = MongoTranscriptionHandler()
        self.wh = WavHandler()
        self.ph = PromptHandler()
        self.filter = Filter(self.mh)
        self.balance = self.conn.get_account_balance()[0].amount
        self.logger = logging.getLogger("transcription_engine.transcription_pipeline_handler")

    def audio_clip_referenced_to_hit(self, priority=1, max_queue_size=10):
        # Queue every Referenced clip, then try to turn the queue into a HIT.
        for audio_clip in self.mh.get_artifacts_by_state("audio_clips", "Referenced"):
            audio_clip_id = audio_clip["_id"]
            self.mh.queue_clip(audio_clip_id, priority, max_queue_size)
            response = self.audio_clip_queue_to_hit()

    def audio_clip_queued_to_hit(self, priority=1, max_queue_size=10):
        # Retry HIT creation for clips already sitting in the Queued state.
        for audio_clip in self.mh.get_artifacts("audio_clips", {"state": "Queued"}):
            audio_clip_id = audio_clip["_id"]
            response = self.audio_clip_queue_to_hit()
        #===================================================================
        # elif state == "Hit":
        #     print("In hit: %s"%audio_clip_url)
        #===================================================================

    def audio_clip_queue_to_hit(self, cost_sensitive=True):
        """Take queued audio clips from the audio clip queue
        put them in a hit and create the hit.
        If successful, update the audio clip state."""
        clip_queue = self.mh.get_audio_clip_queue()
        clip_pairs = self.mh.get_audio_clip_pairs(clip_queue)
        if clip_pairs:
            hit_title = "Audio Transcription"
            question_title = "List and Transcribe"
            description = "Transcribe the audio clip by typing the words the person says in order."
            keywords = "audio, transcription, audio transcription"
            if cost_sensitive:
                reward_per_clip = 0.02
                max_assignments = 3
                estimated_cost = self.hh.estimate_html_HIT_cost(clip_pairs, reward_per_clip, max_assignments)
                clips_in_hits = self.mh.clips_already_in_hit(clip_pairs)
                if clips_in_hits:
                    #If one or more clips are already in a HIT, remove it from the queue
                    self.mh.remove_audio_clips_from_queue(clips_in_hits)
                elif self.balance - estimated_cost >= 250:
                    # NOTE(review): requires a $250 float above the estimate;
                    # compare the elicitation pipeline's `>= 0` — confirm.
                    #if we have enough money, create the HIT
                    response = self.hh.make_html_transcription_HIT(clip_pairs, hit_title,
                                                                   question_title, description, keywords)
                    self.balance = self.balance - estimated_cost
                    if type(response) == ResultSet and len(response) == 1 and response[0].IsValid:
                        response = response[0]
                        self.mh.remove_audio_clips_from_queue(clip_queue)
                        audio_clip_ids = [w["audio_clip_id"] for w in clip_queue]
                        hit_id = response.HITId
                        hit_type_id = response.HITTypeId
                        self.mh.create_transcription_hit_artifact(hit_id, hit_type_id, clip_queue, "New")
                        self.logger.info("Successfully created HIT: %s"%hit_id)
                        return self.mh.update_audio_clips_state(audio_clip_ids, "Hit")
                else:
                    pass
        return False

    def load_assignments_hit_to_submitted(self):
        """Check all assignments for audio clip IDs.
        Update the audio clips.
        This is a non-destructive load of the assignments from MTurk"""
        hits = self.conn.get_all_hits()
        for hit in hits:
            transcription_dicts = [{}]
            hit_id = hit.HITId
            assignments = self.conn.get_assignments(hit_id)
            have_all_assignments = True
            assignment_ids = []
            for assignment in assignments:
                assignment_ids.append(assignment.AssignmentId)
                if self.mh.get_artifact("assignments", {"_id": assignment.AssignmentId}):
                    #We create assignments here, so if we already have it, skip
                    continue
                else:
                    have_all_assignments = False
                transcription_ids = []
                transcription_dicts = self.ah.get_assignment_submitted_transcriptions(assignment)
                if transcription_dicts and len(transcription_dicts) == 10:
                    pass
                for transcription in transcription_dicts:
                    if not self.mh.get_artifact_by_id("audio_clips", transcription["audio_clip_id"]):
                        self.logger.info("Assignment(%s) with unknown audio clip(%s) skipped"%\
                                         (assignment.AssignmentId, transcription["audio_clip_id"]))
                        break
                    self.mh.update_transcription_state(transcription, "Submitted")
                    self.mh.update_audio_clips_state([transcription["audio_clip_id"]], "Submitted")
                    transcription_ids.append(
                        self.mh.get_artifact("transcriptions",
                                             {"audio_clip_id": transcription["audio_clip_id"],
                                              "assignment_id": transcription["assignment_id"]},
                                             "_id"))
                else:
                    # for/else: only when every transcription was known and
                    # stored (no break above).
                    self.mh.create_assignment_artifact(assignment,
                                                       transcription_ids,
                                                       "Submitted")
            if assignments and not have_all_assignments:
                self.mh.update_transcription_hit_state(hit_id, "Submitted")
            print("Transcriptions HIT(%s) submitted assignments: %s "%(hit_id, assignment_ids))

    def assignment_submitted_approved(self):
        """For all submitted assignments,
        if an answered question has a reference transcription,
        check the WER.
        If all the answered questions with reference transcriptions
        have an acceptable WER, approve the assignment and update
        the audio clips and transcriptions."""
        assignments = self.mh.get_artifacts_by_state("assignments", "Submitted")
        rejected_feedback = "I'm sorry but your work in assignment(%s) was rejected because" +\
                            " one or more of your transcriptions " +\
                            " had a word error rate above the maximum acceptable"+\
                            " word error rate of %s. Omitted words and words that "+\
                            " differed by more than %s "+\
                            " characters were counted as an error."
        accepted_feedback = "Your average word error rate on assignment(%s) was %s."+\
                            " Assignment accepted! Thanks for your hard work."
        for assignment in assignments:
            assignment_id = assignment["_id"]
            transcription_ids = assignment["transcriptions"]
            transcriptions = self.mh.get_artifacts("transcriptions", "_id", transcription_ids)
            worker_id = assignment["worker_id"]
            # create_worker_artifact returns the worker's artifact id.
            worker_id = self.mh.create_worker_artifact(worker_id)

            approved, average_wer = self.filter.approve_assignment(transcriptions)
            if approved:
                try:
                    self.conn.approve_assignment(assignment_id, accepted_feedback%(assignment_id, average_wer))
                except MTurkRequestError as e:
                    print(e)
                else:
                    self.mh.update_assignment_state(assignment, "Approved")
                    for transcription in transcriptions:
                        #Approve transcriptions without references in the same assignment
                        reference_id = self.mh.get_artifact_by_id("audio_clips", transcription["audio_clip_id"], "reference_transcription_id")
                        if not reference_id:
                            self.mh.update_transcription_state(transcription, "Approved")
                    print("Approved transcription ids: %s"%transcription_ids)
            else:
                #Don't deny for now
                feedback = rejected_feedback%(assignment_id, self.filter.WER_THRESHOLD, self.filter.CER_THRESHOLD)
                self.logger.info(feedback)
                self.conn.reject_assignment(assignment_id, feedback)
                self.mh.update_assignment_state(assignment, "Denied")
            #print("Assignments not aproved %s "%denied)
            #Update the worker
            if approved:
                self.mh.add_assignment_to_worker(worker_id, (assignment_id, average_wer))

    def _load_rm_audio_source_file_to_clipped(self, file_dir, prompt_file_uri,
                                              base_clip_dir, sample_rate=16000,
                                              http_base_url="http://www.cis.upenn.edu/~tturpen/wavs/",
                                              init_clip_count=200):
        """For an audio directory,
        see which files are new and not an audio source already"""
        prompt_dict = self.ph.get_prompts(prompt_file_uri)
        count = 0
        for root, dirs, files in os.walk(file_dir):
            for f in files:
                # Stop once the requested number of clips has been created.
                if count == init_clip_count:
                    return
                system_uri = os.path.join(root, f)
                # NOTE(review): str.strip(".sph") strips any of the chars
                # '.', 's', 'p', 'h' from both ends, not the suffix — it
                # happens to work for these file names but is fragile.
                out_uri = system_uri.strip(".sph") + ".wav"
                out_uri = os.path.basename(out_uri)
                out_uri = os.path.join(root, (out_uri))
                spkr_id = str(os.path.relpath(root, file_dir))
                #sph to wav
                if not f.endswith(".wav") and not os.path.exists(out_uri):
                    try:
                        self.wh.sph_to_wav(system_uri, out_uri=out_uri)
                    except WavHandlerException as e:
                        self.logger.error("Unable to create wav from sph: " + str(e))
                if os.path.exists(out_uri) and out_uri.endswith(".wav"):
                    #create audio source artifact
                    count += 1
                    wav_filename = os.path.basename(out_uri)
                    # Same strip() caveat as above, applied to ".wav".
                    prompt_id = os.path.basename(out_uri).strip(".wav").upper()
                    encoding = ".wav"
                    sample_rate = 16000
                    disk_space = os.stat(out_uri).st_size
                    length_seconds = self.wh.get_audio_length(out_uri)
                    if prompt_id in prompt_dict:
                        transcription_prompt = prompt_dict[prompt_id]
                    else:
                        #No prompt found
                        raise PromptNotFound
                    source_id = self.mh.create_audio_source_artifact(
                        out_uri, disk_space, length_seconds, sample_rate,
                        spkr_id, encoding)
                    #create audio clip artifact
                    audio_clip_uri = os.path.join(base_clip_dir, spkr_id, wav_filename)
                    clip_dir = os.path.dirname(audio_clip_uri)
                    if not os.path.exists(clip_dir):
                        os.makedirs(clip_dir)
                    if not os.path.exists(audio_clip_uri):
                        copyfile(out_uri, audio_clip_uri)
                    #http_url
                    http_url = os.path.join(http_base_url, spkr_id, wav_filename)
                    clip_id = self.mh.create_audio_clip_artifact(
                        source_id, 0, -1, audio_clip_uri, http_url,
                        length_seconds, disk_space)
                    #Update the audio source, updates state too
                    self.mh.update_audio_source_audio_clip(source_id, clip_id)
                    #Create the reference transcription artifact
                    transcription_id = self.mh.create_reference_transcription_artifact(
                        clip_id, transcription_prompt, "Gold")
                    #Completes audio clip to Referenced
                    self.mh.update_audio_clip_reference_transcription(clip_id, transcription_id)

    def all_workers_liveness(self):
        """Interactively dump each worker's approved/denied transcription pairs."""
        workers = self.mh.get_all_workers()
        for worker in workers:
            worker_id = worker["_id"]
            approved, denied = self.mh.get_worker_assignments(worker)
            print("Worker(%s) assignments, approved(%s) denied(%s)"%(worker["_id"], approved, denied))
            # NOTE(review): input() (not raw_input) evaluates the reply as an
            # int expression in Python 2 — intended here since it is compared
            # to 1/2/3, but unsafe on arbitrary input.
            selection = input("1. Show denied transcriptions and references.\n"+
                              "2. Show accepted transcriptions and references.\n"+
                              "3. Show both denied and accepted transcriptions.")
            if selection == 1 or selection == 3:
                print("Approved transcriptions")
                for assignment_id in approved:
                    transcription_pairs = self.mh.get_transcription_pairs(assignment_id)
                    for pair in transcription_pairs:
                        print ("Reference:\n\t%s\nHypothesis:\n\t%s\n"%(pair[0], pair[1]))
            if selection == 2 or selection == 3:
                print("Denied transcriptions")
                for assignment_id in denied:
                    transcription_pairs = self.mh.get_transcription_pairs(assignment_id)
                    for pair in transcription_pairs:
                        print ("Reference:\n\t%s\nHypothesis:\n\t%s\n"%(pair[0], pair[1]))

    def stats(self):
        """Print per-worker and overall average WER for approved assignments."""
        workers = self.mh.get_all_workers()
        all_wer_per_approved_assignment = 0.0
        total_accepted = 0.0
        for worker in workers:
            worker_wer = 0.0
            worker_id = worker["_id"]
            approved, denied = self.mh.get_worker_assignments_wer(worker)
            for w in approved:
                # w is (assignment_id, wer); accumulate the WER column.
                all_wer_per_approved_assignment += float(w[1])
                worker_wer += float(w[1])
                total_accepted += 1
            if approved:
                worker_average_wer = worker_wer/len(approved)
                print("%s,%s"%(len(approved), worker_average_wer))
            #print("Worker(%s) approved assignments(%s)\n denied assignments(%s)"%(worker_id,approved,denied))
        av = all_wer_per_approved_assignment/total_accepted
        print("Average WER per assignment(%s)"%(av))

    def get_assignment_stats(self):
        # Fixed reward of $0.20 per assignment for the wage estimate.
        self.effective_hourly_wage_for_approved_assignments(.20)

    def effective_hourly_wage_for_approved_assignments(self, reward_per_assignment):
        """Calculate the effective hourly wage for Approved Assignments"""
        approved_assignments = self.mh.get_artifacts_by_state("assignments", "Approved")
        total = datetime.timedelta(0)
        count = 0
        for assignment in approved_assignments:
            if "SubmitTime" in assignment:
                accepted = datetime.datetime.strptime(assignment["AcceptTime"], "%Y-%m-%dT%H:%M:%SZ")
                submitted = datetime.datetime.strptime(assignment["SubmitTime"], "%Y-%m-%dT%H:%M:%SZ")
            else:
                # NOTE(review): when SubmitTime is missing this branch falls
                # through and the accumulation below reuses the PREVIOUS
                # iteration's accepted/submitted (or raises NameError on the
                # first record) — confirm intent.
                pass
            total += submitted - accepted
            count += 1
        seconds_per_assignment = total.total_seconds()/count
        effective_hourly_wage = 60.0*60.0/seconds_per_assignment * reward_per_assignment
        print("Effective completion time(%s) *reward(%s) = %s"%(seconds_per_assignment, reward_per_assignment, effective_hourly_wage))

    def allhits_liveness(self):
        """Interactively disable HITs and recycle their audio clips."""
        #allassignments = self.conn.get_assignments(hit_id)
        #first = self.ah.get_submitted_transcriptions(hit_id,str(clipid))
        hits = self.conn.get_all_hits()
        for hit in hits:
            hit_id = hit.HITId
            print("HIT ID: %s"%hit_id)
            assignments = self.conn.get_assignments(hit_id)
            if len(assignments) == 0:
                if raw_input("Remove hit with no submitted assignments?(y/n)") == "y":
                    try:
                        self.conn.disable_hit(hit_id)
                        clips = self.mh.get_artifact("transcription_hits", {"_id": hit_id}, "clips")
                        self.mh.remove_transcription_hit(hit_id)
                        # Return the orphaned clips to a queueable state.
                        self.mh.update_audio_clips_state(clips, "Referenced")
                    except MTurkRequestError as e:
                        raise e
            else:
                if raw_input("Remove hit with %s submitted assignments?(y/n)"%len(assignments)) == "y":
                    try:
                        self.conn.disable_hit(hit_id)
                    except MTurkRequestError as e:
                        raise e

    def run(self):
        """Text-menu driver for the transcription pipeline."""
        audio_file_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/ind_trn"
        #audio_file_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/dep_trn"
        prompt_file_uri = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/doc/al_sents.snr"
        base_clip_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/clips"
        selection = 0
        init_clip_count = 10000
        while selection != "11":
            selection = raw_input("""Audio Source file to Audio Clip Approved Pipeline:\n
                 1: AudioSource-FileToClipped: Initialize Resource Management audio source files to %d queueable(Referenced) clips
                 2: AudioClip-ReferencedToHit: Queue all referenced audio clips and create a HIT if the queue is full.
                 3: AudioClip-HitToSubmitted: Check all submitted assignments for Transcriptions.
                 4: AudioClip-SubmittedToApproved: Check all submitted clips against their reference.
                 5: Review Current Hits
                 6: Worker liveness
                 7: Account balance
                 8: Worker stats
                 9: Recalculate worker WER
                 10: Assignment Stats
                 11: Exit
            """%init_clip_count)
            #selection = "5"
            if selection == "1":
                self._load_rm_audio_source_file_to_clipped(audio_file_dir,
                                                           prompt_file_uri,
                                                           base_clip_dir,
                                                           init_clip_count=init_clip_count)
            elif selection == "2":
                self.audio_clip_referenced_to_hit()
            elif selection == "3":
                self.load_assignments_hit_to_submitted()
            elif selection == "4":
                self.assignment_submitted_approved()
            elif selection == "5":
                self.allhits_liveness()
            elif selection == "6":
                self.all_workers_liveness()
            elif selection == "7":
                print("Account balance: %s"%self.balance)
            elif selection == "8":
                self.stats()
            elif selection == "9":
                # NOTE(review): recalculate_worker_assignment_wer is commented
                # out below, so option 9 raises AttributeError — confirm.
                self.recalculate_worker_assignment_wer()
            elif selection == "10":
                self.get_assignment_stats()

    # def get_time_submitted_for_assignments(self):
    #     assignments = self.mh.get_all_artifacts("assignments")
    #     for assignment in assignments:
    #         assignment_id = assignment["_id"]
    #         a_assignment = self.conn.get_assignment(assignment_id)[0]
    #         self.mh.update_artifact_by_id("assignments", assignment_id, "SubmitTime", a_assignment.SubmitTime)

    # def recalculate_worker_assignment_wer(self):
    #     """For all submitted assignments,
    #     if an answered question has a reference transcription,
    #     check the WER.
    #     If all the answered questions with reference transcriptions
    #     have an acceptable WER, approve the assignment and update
    #     the audio clips and transcriptions."""
    #     assignments = self.mh.get_artifacts("assignments",{"state":"Approved"})
    #     for assignment in assignments:
    #         assignment_id = assignment["_id"]
    #         denied = []
    #         #If no transcriptions have references then we automatically approve the HIT
    #         approved = True
    #         transcription_ids = assignment["transcriptions"]
    #         transcriptions = self.mh.get_transcriptions("_id",transcription_ids)
    #         worker_id = assignment["worker_id"]
    #         worker_id = self.mh.create_worker_artifact(worker_id)
    #
    #         max_rej_wer = (0.0,0.0)
    #         total_wer = 0.0
    #         for transcription in transcriptions:
    #             #Normalize the transcription
    #             #self.mh.normalize_transcription
    #             reference_id = self.mh.get_audio_clip_by_id(transcription["audio_clip_id"],"reference_transcription_id")
    #             if reference_id:
    #                 reference_transcription = self.mh.get_reference_transcription({"_id": reference_id},
    #                                                                              "transcription")
    #                 new_transcription = transcription["transcription"].split(" ")
    #                 if reference_transcription:
    #                     transcription_wer = cer_wer(reference_transcription,new_transcription)
    #                     total_wer += transcription_wer
    #                     if transcription_wer < WER_THRESHOLD:
    #                         self.logger.info("WER for transcription(%s) %d"%(transcription["transcription"],transcription_wer))
    #                     else:
    #                         max_rej_wer = (transcription_wer,WER_THRESHOLD)
    #                         denied.append((reference_transcription,new_transcription))
    #                         approved = False
    #         average_wer = total_wer/len(transcriptions)
    #         #Update the worker
    #         self.mh.add_assignment_to_worker(worker_id,(assignment_id,average_wer))
class MturkHelper(object):
    """
    This class handles task creation for amazon mechanical task service.
    Amazon MTruk is used to crowdsource matching products.
    Initialisation :
        - reference : reference of the product
        - osm_from : the origin osm of a product
        - osm_to : the osm to look into
    """
    # NOTE(review): the non-sandbox branch embeds literal AWS credentials in
    # source control -- these are leaked secrets; rotate them and load from
    # settings/environment instead.
    if settings.SANDBOX:
        AWS_SECRET_ACCESS_KEY = settings.AWS_SECRET_ACCESS_KEY
        AWS_ACCESS_KEY_ID = settings.AWS_ACCESS_KEY_ID
    else:
        AWS_SECRET_ACCESS_KEY = 'e6/8e5lcCcESPKT/fe6kYkJtf0+7F2w7459WTJ0v'
        AWS_ACCESS_KEY_ID = 'AKIAIP5JQO7FQX6Q7JAQ'

    def __init__(self, reference=None, osm_from=None, osm_to=None, key=None, hitid=None):
        # Store the product-matching parameters; `task` is resolved lazily
        # from `key` when one is given.
        self.reference = reference
        self.osm_from = osm_from
        self.osm_to = osm_to
        self.key = key
        self.hitid = hitid
        if key is None:
            self.task = None
        else:
            self.task = self.get_task()
        self.mtc = MTurkConnection(
            aws_access_key_id=MturkHelper.AWS_ACCESS_KEY_ID,
            aws_secret_access_key=MturkHelper.AWS_SECRET_ACCESS_KEY,
            host=settings.HOST)

    def get_all_reviewable_hits(self):
        """Fetch every reviewable HIT, following result-set pagination."""
        page_size = 50
        hits = self.mtc.get_reviewable_hits(page_size=page_size)
        print "Total results to fetch %s " % hits.TotalNumResults
        print "Request hits page %i" % 1
        # Ceil(TotalNumResults / page_size) computed by hand.
        total_pages = float(hits.TotalNumResults) / page_size
        int_total = int(total_pages)
        if (total_pages - int_total > 0):
            total_pages = int_total + 1
        else:
            total_pages = int_total
        pn = 1
        while pn < total_pages:
            pn = pn + 1
            print "Request hits page %i" % pn
            temp_hits = self.mtc.get_reviewable_hits(page_size=page_size, page_number=pn)
            hits.extend(temp_hits)
        return hits

    def get_hits(self, validate=False, all_hits=False):
        """Walk HITs (reviewable or all), saving 'flagged' answers as
        ResultTask rows; with validate=True also approve assignments and
        disable each processed HIT."""
        if not all_hits:
            hits = self.get_all_reviewable_hits()
        else:
            hits = self.mtc.get_all_hits()
        for hit in hits:
            print "####################"
            print "--------------------"
            print "HitId = %s" % (hit.HITId)
            assignments = self.mtc.get_assignments(hit.HITId)
            # Getting task associated to hit
            task = Task.objects.filter(hitId=hit.HITId)
            print 'Number of corresponding tasks = %d' % len(task)
            if len(task) > 0:
                task = task[0]
            else:
                task = None
            for assignment in assignments:
                print "AssignmentId = %s" % (assignment.AssignmentId)
                print "Answers of the worker %s" % assignment.WorkerId
                for question_form_answer in assignment.answers[0]:
                    qid = question_form_answer.qid
                    if qid == 'flagged':
                        for value in question_form_answer.fields:
                            # Saving resultTask
                            if task is not None:
                                print 'Saving result task, result = %s' % (value)
                                resulttask, created = ResultTask.objects.get_or_create(
                                    task=task,
                                    assignementId=assignment.AssignmentId,
                                    workerId=assignment.WorkerId)
                                resulttask.reference = value
                                resulttask.save()
                    # NOTE(review): this approves once per non-'flagged'
                    # question answer, so an assignment with several such
                    # answers triggers repeated approve calls -- confirm.
                    elif validate:
                        try:
                            self.mtc.approve_assignment(assignment.AssignmentId)
                        except Exception, e:
                            print e
            try:
                if validate:
                    self.mtc.disable_hit(hit.HITId)
            except Exception, e:
                print e
            print "--------------------"
#!/home/dave/anaconda2/bin/python import sys sys.path.append( '/home/dave/OneDrive/Research/By Project/Dissertation/experiments/private/' ) from boto.mturk.connection import MTurkConnection from awsKeys import aws_access_key_id from awsKeys import aws_secret_access_key HOST = 'mechanicalturk.sandbox.amazonaws.com' # Use this to post to the sandbox instead mtc = MTurkConnection(aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, host=HOST) hit_ids = [] for h in mtc.get_all_hits(): hit_ids.append(h.HITId) for h in hit_ids: mtc.disable_hit(h) print mtc.get_hit(h)[0].HITId + ': ' + mtc.get_hit(h)[0].HITStatus
    # Tail of get_all_reviewable_hits(mtc): the def line, the first page
    # fetch (`hits`, `page_size`) and the float page-count computation are
    # above this excerpt. This part rounds the page count up and fetches
    # the remaining pages.
    if(total_pages-int_total>0):
        total_pages = int_total+1
    else:
        total_pages = int_total
    pn = 1
    while pn < total_pages:
        pn = pn + 1
        print "Request hits page %i" % pn
        temp_hits = mtc.get_reviewable_hits(page_size=page_size,page_number=pn)
        hits.extend(temp_hits)
    return hits


# Script body: connect to the sandbox, then disable every reviewable HIT
# after dumping its workers' answers.
# NOTE(review): the access/secret key strings look like placeholders --
# real credentials are expected to be substituted before running.
mtc = MTurkConnection(aws_access_key_id='llllllllllllllllllllllllllllllllllllll',
                      aws_secret_access_key='oooooooooooooooooooooooooooooooooooo',
                      host='mechanicalturk.sandbox.amazonaws.com')

hits = get_all_reviewable_hits(mtc)

for hit in hits:
    # Fetch assignments before disabling the HIT, then print every answer
    # field of every question form.
    assignments = mtc.get_assignments(hit.HITId)
    mtc.disable_hit(hit.HITId, response_groups=None)
    for assignment in assignments:
        print "Answers of the worker %s" % assignment.WorkerId
        for question_form_answer in assignment.answers:
            for element in question_form_answer:
                for value in element.fields:
                    print "%s" % (value)
        print "------------------------------------------------"
def cleanup():
    """Remove any boto test related HIT's.

    Pages through search_hits() on the sandbox and, for every HIT whose
    Description contains 'Boto', disables it (if not yet Reviewable) or
    disposes of it (if Reviewable). Other HITs are recorded as ignored.
    """
    conn = MTurkConnection(host='mechanicalturk.sandbox.amazonaws.com')
    current_page = 1
    page_size = 10
    total_disabled = 0
    ignored = []
    while True:
        # reset the total for this loop
        disabled_count = 0
        # search all the hits in the sandbox
        search_rs = conn.search_hits(page_size=page_size, page_number=current_page)
        # success?
        if search_rs.status:
            for hit in search_rs:
                # delete any with Boto in the description
                print 'hit id:%s Status:%s, desc:%s' %(hit.HITId, hit.HITStatus, hit.Description)
                if hit.Description.find('Boto') != -1:
                    if hit.HITStatus != 'Reviewable':
                        print 'Disabling hit id:%s %s' %(hit.HITId, hit.Description)
                        disable_rs = conn.disable_hit(hit.HITId)
                        if disable_rs.status:
                            disabled_count += 1
                            # update the running total
                            total_disabled += 1
                        else:
                            print 'Error when disabling, code:%s, message:%s' %(disable_rs.Code, disable_rs.Message)
                    else:
                        # Reviewable HITs cannot be disabled; dispose instead.
                        print 'Disposing hit id:%s %s' %(hit.HITId, hit.Description)
                        dispose_rs = conn.dispose_hit(hit.HITId)
                        if dispose_rs.status:
                            disabled_count += 1
                            # update the running total
                            total_disabled += 1
                        else:
                            print 'Error when disposing, code:%s, message:%s' %(dispose_rs.Code, dispose_rs.Message)
                else:
                    if hit.HITId not in ignored:
                        print 'ignored:%s' %hit.HITId
                        ignored.append(hit.HITId)
            # any more results?
            if int(search_rs.TotalNumResults) > current_page*page_size:
                # if we have disabled any HITs on this page
                # then we don't need to go to a new page
                # otherwise we do
                # (deleting shifts later results onto the current page,
                # so only advance when this page removed nothing)
                if not disabled_count:
                    current_page += 1
            else:
                # no, we're done
                break
        else:
            print 'Error performing search, code:%s, message:%s' %(search_rs.Code, search_rs.Message)
            break
    total_ignored = len(ignored)
    print 'Processed: %d HITs, disabled/disposed: %d, ignored: %d' %(total_ignored + total_disabled, total_disabled, total_ignored)
class MturkHelper(object):
    """
    This class handles task creation for amazon mechanical task service.
    Amazon MTruk is used to crowdsource matching products.
    Initialisation :
        - reference : reference of the product
        - osm_from : the origin osm of a product
        - osm_to : the osm to look into
    """
    # NOTE(review): a near-identical MturkHelper definition appears earlier
    # in this file -- likely an accidental duplicate; consolidate.
    # NOTE(review): literal AWS credentials committed here in the
    # non-sandbox branch are leaked secrets; rotate and externalize.
    if settings.SANDBOX:
        AWS_SECRET_ACCESS_KEY = settings.AWS_SECRET_ACCESS_KEY
        AWS_ACCESS_KEY_ID = settings.AWS_ACCESS_KEY_ID
    else:
        AWS_SECRET_ACCESS_KEY = 'e6/8e5lcCcESPKT/fe6kYkJtf0+7F2w7459WTJ0v'
        AWS_ACCESS_KEY_ID = 'AKIAIP5JQO7FQX6Q7JAQ'

    def __init__(self, reference = None, osm_from = None, osm_to = None, key = None, hitid = None):
        # Product-matching parameters; `task` resolved from `key` if given.
        self.reference = reference
        self.osm_from = osm_from
        self.osm_to = osm_to
        self.key = key
        self.hitid = hitid
        if key is None:
            self.task = None
        else:
            self.task = self.get_task()
        self.mtc = MTurkConnection(aws_access_key_id=MturkHelper.AWS_ACCESS_KEY_ID,
                                   aws_secret_access_key=MturkHelper.AWS_SECRET_ACCESS_KEY,
                                   host=settings.HOST)

    def get_all_reviewable_hits(self):
        """Fetch every reviewable HIT, following result-set pagination."""
        page_size = 50
        hits = self.mtc.get_reviewable_hits(page_size=page_size)
        print "Total results to fetch %s " % hits.TotalNumResults
        print "Request hits page %i" % 1
        # Manual ceiling of TotalNumResults / page_size.
        total_pages = float(hits.TotalNumResults)/page_size
        int_total= int(total_pages)
        if(total_pages-int_total>0):
            total_pages = int_total+1
        else:
            total_pages = int_total
        pn = 1
        while pn < total_pages:
            pn = pn + 1
            print "Request hits page %i" % pn
            temp_hits = self.mtc.get_reviewable_hits(page_size=page_size,page_number=pn)
            hits.extend(temp_hits)
        return hits

    def get_hits(self, validate = False, all_hits = False):
        """Walk HITs (reviewable or all), saving 'flagged' answers as
        ResultTask rows; with validate=True also approve assignments and
        disable each processed HIT."""
        if not all_hits:
            hits = self.get_all_reviewable_hits()
        else:
            hits = self.mtc.get_all_hits()
        for hit in hits:
            print "####################"
            print "--------------------"
            print "HitId = %s"%(hit.HITId)
            assignments = self.mtc.get_assignments(hit.HITId)
            # Getting task associated to hit
            task = Task.objects.filter(hitId = hit.HITId)
            print 'Number of corresponding tasks = %d'%len(task)
            if len(task)>0:
                task = task[0]
            else:
                task = None
            for assignment in assignments:
                print "AssignmentId = %s"%(assignment.AssignmentId)
                print "Answers of the worker %s" % assignment.WorkerId
                for question_form_answer in assignment.answers[0]:
                    qid = question_form_answer.qid
                    if qid == 'flagged':
                        for value in question_form_answer.fields:
                            # Saving resultTask
                            if task is not None:
                                print 'Saving result task, result = %s'%(value)
                                resulttask, created = ResultTask.objects.get_or_create(task = task,
                                                                                       assignementId = assignment.AssignmentId,
                                                                                       workerId = assignment.WorkerId)
                                resulttask.reference = value
                                resulttask.save()
                    # NOTE(review): approves once per non-'flagged' question
                    # answer rather than once per assignment -- confirm.
                    elif validate:
                        try:
                            self.mtc.approve_assignment(assignment.AssignmentId)
                        except Exception, e:
                            print e
            try:
                if validate:
                    self.mtc.disable_hit(hit.HITId)
            except Exception, e:
                print e
            print "--------------------"
class Mturk():
    """Thin wrapper around a boto MTurkConnection configured from a YAML file.

    Provides account/HIT/assignment helpers plus HIT creation from a
    MturkTmpl HTML question.
    """

    def __init__(self):
        self.config = self.set_config()
        self.mturk = MTurkConnection(
            aws_access_key_id=self.config['aws_access_key_id'],
            aws_secret_access_key=self.config['aws_secret_access_key'],
            host=self.config['host'])
        self.mturk_tmpl = MturkTmpl()

    def set_config(self, config_path="config.yml"):
        """Load and return the YAML configuration mapping from config_path."""
        with open(config_path, 'r') as config_file:
            # Fix: bare yaml.load() can construct arbitrary Python objects
            # from tagged input (and is deprecated without a Loader);
            # safe_load() parses plain config data safely.
            config = yaml.safe_load(config_file)
        return config

    def account_balance(self):
        """Print the account balance (doubles as a connection smoke test)."""
        account_balance = self.mturk.get_account_balance()
        print("Testing connection: You have a balance of: {}".format(
            account_balance))

    def get_hits(self):
        """Return all HITs on the account."""
        return self.mturk.get_all_hits()

    def get_all_assignments(self, hit_id):
        """Lazily iterate every assignment of a HIT across result pages."""
        page_size = 100
        assignments = self.mturk.get_assignments(hit_id, page_size=page_size)
        total_records = int(assignments.TotalNumResults)
        get_page_assignments = lambda page: self.mturk.get_assignments(
            hit_id, page_size=page_size, page_number=page)
        page_nums = self.mturk._get_pages(page_size=page_size,
                                          total_records=total_records)
        # Python 2 lazy map; chain the per-page result sets into one stream.
        assignments_sets = itertools.imap(get_page_assignments, page_nums)
        return itertools.chain.from_iterable(assignments_sets)

    def remove_old_hits(self):
        """Disable (remove) every existing HIT on the account."""
        # Disable old hits.
        for hit in self.get_hits():
            print("Hit {} has been removed.".format(hit.HITId))
            self.mturk.disable_hit(hit.HITId)

    def cal_reward(self, data):
        """Estimate the reward (USD, rounded to cents) for a HIT.

        Budget: 3 minutes to read instructions plus 1 minute per 30
        entities in data['ents'], paid at $6.00 per hour.
        """
        read_instruction = 3.0
        word_count = len(data['ents']) * 1 / 30.0
        return round((read_instruction + word_count) / 60.0 * 6.0, 2)

    def create_hit(self, data):
        """Create a single-assignment HIT rendered from the HTML template.

        These parameters define the HIT that will be created:
        - question is the rendered HTML question form
        - max_assignments is the # of unique Workers you're requesting
        - title, description, and keywords help Workers find your HIT
        - duration is the # of seconds Workers have to complete your HIT
        - reward is what Workers will be paid when you approve their work
        Check out the documentation on CreateHIT for more details.
        """
        response = self.mturk.create_hit(
            question=self.mturk_tmpl.html_question(data),
            max_assignments=1,
            title=self.config['title'],
            description=self.config['description'],
            keywords=self.config['keywords'],
            duration=120,
            reward=self.cal_reward(data))
        return response
# Truncate (or create) one log file per worker pool and make each one
# world-writable so any pool process can write to it.
for pool_idx in xrange(NUMBEROFWORKERPOOLS):
    log_path = 'log/aql' + str(pool_idx)
    with open(log_path, 'w') as log_file:
        log_file.write('')
    chmod(log_path, 0o777)

if not SIMULATION:
    # Talk to the sandbox or the live marketplace, then clear out any
    # HITs left over from a previous run.
    host = ('mechanicalturk.sandbox.amazonaws.com'
            if SANDBOX else 'mechanicalturk.amazonaws.com')
    mturk = MTurkConnection(AWSAKID, AWSSAK, host=host)
    for hit in mturk.get_all_hits():
        mturk.disable_hit(hit.HITId)
print "Request hits page %i" % 1 total_pages = float(hits.TotalNumResults)/page_size int_total= int(total_pages) if(total_pages-int_total>0): total_pages = int_total+1 else: total_pages = int_total pn = 1 while pn < total_pages: pn = pn + 1 print "Request hits page %i" % pn temp_hits = mtc.get_reviewable_hits(page_size=page_size,page_number=pn) hits.extend(temp_hits) return hits mtc = MTurkConnection(aws_access_key_id='SSSSSSSSSSSSSSSSSSSSSS', aws_secret_access_key='Vkkkkkkkkkkkkkkkkkkkkkkkkkkk', host='mechanicalturk.sandbox.amazonaws.com') hits = get_all_reviewable_hits(mtc) #expire a HIT for hit in hits: print (hit) mtc.disable_hit(hit, response_groups=None) else: print ("Successfully Expired all the reviewable HITs")
    # Tail of a MTurkConnection(...) constructor call whose opening line
    # (with the credentials) is above this excerpt.
    host=HOST)

# HIT parameters for the external-question demo task.
url = "https://mturk-poc.herokuapp.com/"
title = "Describe this group of people in your own words"
description = "Describe your first impressions of this group of people however you want."
keywords = ["easy"]
frame_height = 800
amount = 0.05

# The task itself is served from `url` inside an MTurk iframe.
questionform = ExternalQuestion(url, frame_height)

# Clear out any pre-existing HITs before posting the new one.
all_hits = [hit for hit in connection.get_all_hits()]
if all_hits:
    for hit in all_hits:
        connection.disable_hit(hit.HITId)

create_hit_result = connection.create_hit(
    title=title,
    description=description,
    keywords=keywords,
    max_assignments=4,
    lifetime=datetime.timedelta(hours=2),
    question=questionform,
    reward=Price(amount=amount),
    response_groups=('Minimal', 'HITDetail'),
)

# Re-fetch to inspect the newly created HIT; the loop body continues
# beyond this excerpt.
all_hits = [hit for hit in connection.get_all_hits()]
for hit in all_hits:
def processHITs(verbose=True, approveAll=False, deleteAll=False, insertComparisons=False):
    """Collect, classify and optionally moderate all reviewable HITs.

    For every assignment of every reviewable HIT, parse the known question
    answers into a flat dict, reject assignments whose form set
    _hit_reject_flag, optionally approve the rest (approveAll) and disable
    each HIT (deleteAll). Results are pickled to MTURK_STORAGE_PATH when
    that env var is set.

    Returns a dict with keys "_all_hits", "_rejected_hits",
    "_flagged_hits" (lists of per-assignment data dicts), or None if an
    unknown answer key is encountered.
    """
    mtc = MTurkConnection(host=_host)
    hits = getReviewableHITs(verbose)
    # store hit info here, for persistence
    _hits_vector = []
    _rejected_hits = []
    _flagged_hits = []
    # stats variables
    worker_ids = set()
    for hit in hits:
        assignments = mtc.get_assignments(hit.HITId, page_size=50)
        for assignment in assignments:
            worker_ids.add(assignment.WorkerId)
            if verbose:
                print "Answers of the worker: [%s]" % assignment.WorkerId
            # Defaults for every expected form field; overwritten below as
            # the corresponding answers are found.
            _worker_id = ''
            _worker_exp = 0
            _hit_id = 0
            _assignment_id = ''
            _gui_rating = ''
            _hit_comment = ''
            _hit_rt = 0
            _hit_it = 0
            _trials_results = ''
            _hit_interactions_str = ''
            _hit_reject_flag = False
            _hit_flag = False
            # Dispatch on each answered question's qid.
            for question_form_answer in assignment.answers[0]:
                key = question_form_answer.qid
                value = question_form_answer.fields
                if key == '_worker_id':
                    _worker_id = value[0]
                    if verbose:
                        print " - Worker ID: [%s]" % (_worker_id)
                elif key == '_worker_exp':
                    _worker_exp = int(value[0])
                    if verbose:
                        print " - Worker experience: [%d]" % (_worker_exp)
                elif key == '_hit_id':
                    _hit_id = int(value[0])
                    if verbose:
                        print " - HIT ID: [%d]" % (_hit_id)
                elif key == '_assignment_id':
                    _assignment_id = value[0]
                    if verbose:
                        print " - Assignment ID: [%s]" % (_assignment_id)
                elif key == '_gui_rating':
                    _gui_rating = value[0]
                    # Non-numeric ratings are coerced to -1.
                    try:
                        _gui_rating = int(_gui_rating)
                    except ValueError:
                        _gui_rating = -1
                    if verbose:
                        print " - GUI rating: [%d/10]" % (_gui_rating)
                elif key == '_hit_comment':
                    _hit_comment = value[0]
                    if verbose:
                        print " - HIT comment: [%s]" % (_hit_comment)
                elif key == '_hit_rt':
                    _hit_rt = int(value[0])
                    if verbose:
                        print " - HIT response time: [%d]" % (_hit_rt)
                elif key == '_hit_it':
                    _hit_it = int(value[0])
                    if verbose:
                        print " - HIT instruction time: [%d]" % (_hit_it)
                elif key == '_trials_results':
                    _trials_results = value[0]
                    if verbose:
                        print " - All HIT's trials results: [%s]" % (
                            _trials_results)
                elif key == '_hit_interactions_str':
                    _hit_interactions_str = value[0]
                    if verbose:
                        print " - HIT interactions string: [%s]" % (
                            _hit_interactions_str)
                elif key == '_hit_reject_flag':
                    # Form sends 'true'/'false' strings; anything other
                    # than 'false' counts as a reject.
                    _hit_reject_flag = value[0]
                    if str(_hit_reject_flag) == 'false':
                        _hit_reject_flag = False
                    else:
                        _hit_reject_flag = True
                    if verbose:
                        print " - HIT reject flag: [%s]" % (
                            str(_hit_reject_flag))
                elif key == '_hit_flag':
                    # 'Yes' means the turker flagged the HIT for attention.
                    _hit_flag = value[0]
                    if _hit_flag == 'Yes':
                        _hit_flag = True
                    else:
                        _hit_flag = False
                    if verbose:
                        print " - HIT information flag: [%s]" % (
                            str(_hit_flag))
                else:
                    # Unknown key: dump diagnostics and bail out entirely.
                    print "<----------------------------->"
                    print "ERROR: unknown key [%r]" % (key, )
                    print "Relevant info:"
                    pprint(vars(assignment))
                    pprint(vars(question_form_answer))
                    print "Exiting..."
                    print "<----------------------------->"
                    return
            #if insertComparisons:
            #    pass # insert the comparisons into the database
            # Flatten the assignment metadata plus parsed answers into one
            # dict (drop the raw answers object).
            _hit_data = assignment.__dict__.copy()
            del _hit_data['answers']
            _hit_data['_worker_id'] = _worker_id
            _hit_data['_worker_exp'] = _worker_exp
            _hit_data['_hit_id'] = _hit_id
            _hit_data['_assignment_id'] = _assignment_id
            _hit_data['_gui_rating'] = _gui_rating
            _hit_data['_hit_comment'] = _hit_comment
            _hit_data['_hit_rt'] = _hit_rt
            _hit_data['_hit_it'] = _hit_it
            _hit_data['_trials_results'] = _trials_results
            _hit_data['_hit_interactions_str'] = _hit_interactions_str
            _hit_data['_hit_reject_flag'] = _hit_reject_flag
            _hit_data['_hit_flag'] = _hit_flag
            _hits_vector.append(_hit_data)
            if _hit_reject_flag:
                # Low-quality submission: record it and reject on MTurk.
                _rejected_hits.append(_hit_data)
                print "<----------------------------->"
                print "This HIT is low quality - Will be rejected."
                print "Relevant info:"
                pprint(vars(assignment))
                for question_form_answer in assignment.answers[0]:
                    pprint(vars(question_form_answer))
                print "<----------------------------->"
                try:
                    mtc.reject_assignment(assignment.AssignmentId)
                except MTurkRequestError:
                    print "Could not reject [%s]" % (assignment.AssignmentId)
            else:
                if _hit_flag:
                    # Turker flagged the HIT itself for attention.
                    _flagged_hits.append(_hit_data)
                    print "<----------------------------->"
                    print "This HIT has been flagged by turker."
                    print "Relevant info:"
                    pprint(vars(assignment))
                    for question_form_answer in assignment.answers[0]:
                        pprint(vars(question_form_answer))
                    print "<----------------------------->"
                if approveAll:
                    try:
                        mtc.approve_assignment(assignment.AssignmentId)
                    except MTurkRequestError:
                        print "Could not approve [%s]" % (
                            assignment.AssignmentId)
            if verbose:
                print "<----------------------------->"
        if deleteAll:
            mtc.disable_hit(hit.HITId)
    # print out some stats
    print "Number of HITs = [%d]" % (len(_hits_vector), )
    print "Number of distinct workers = [%d]" % (len(worker_ids), )
    print "Number of rejected HITs = [%d]" % (len(_rejected_hits), )
    print "Number of flagged HITs = [%d]" % (len(_flagged_hits), )
    return_dict = {
        "_all_hits": _hits_vector,
        "_rejected_hits": _rejected_hits,
        "_flagged_hits": _flagged_hits
    }
    if 'MTURK_STORAGE_PATH' in os.environ:
        time_stamp = time.strftime("%Y-%m-%d_%H-%M-%S")
        hit_name = "completed_cocoa_5000"
        filename = os.path.join(os.environ['MTURK_STORAGE_PATH'],
                                hit_name + '_' + time_stamp + ".pkl")
        print "Storing collected hit data at %s" % (filename)
        with open(filename, 'wb') as f:
            pickle.dump(return_dict, f)
    else:
        print "WARNING: MTURK_STORAGE_PATH not set in env. Unable to save hit data."
    return return_dict
        # Mid-branch tail of get_all_reviewable_hits(mtc): the def line,
        # the first page fetch (`hits`, `page_size`) and the `if` testing
        # for a partial final page are above this excerpt.
        total_pages = int_total + 1
    else:
        total_pages = int_total
    pn = 1
    while pn < total_pages:
        pn = pn + 1
        print "Request hits page %i" % pn
        temp_hits = mtc.get_reviewable_hits(page_size=page_size, page_number=pn)
        hits.extend(temp_hits)
    return hits


# Script body: connect to the sandbox, then disable every reviewable HIT
# after dumping its workers' answers.
# NOTE(review): the access/secret key strings look like placeholders --
# real credentials must be substituted before running.
mtc = MTurkConnection(
    aws_access_key_id='llllllllllllllllllllllllllllllllllllll',
    aws_secret_access_key='oooooooooooooooooooooooooooooooooooo',
    host='mechanicalturk.sandbox.amazonaws.com')

hits = get_all_reviewable_hits(mtc)

for hit in hits:
    # Fetch assignments before disabling the HIT, then print every answer
    # field of every question form.
    assignments = mtc.get_assignments(hit.HITId)
    mtc.disable_hit(hit.HITId, response_groups=None)
    for assignment in assignments:
        print "Answers of the worker %s" % assignment.WorkerId
        for question_form_answer in assignment.answers:
            for element in question_form_answer:
                for value in element.fields:
                    print "%s" % (value)
        print "------------------------------------------------"
class MTurkProvider(object): description = 'This is a task authored by a requester on Daemo, a research crowdsourcing platform. ' \ 'Mechanical Turk workers are welcome to do it' keywords = ['daemo'] countries = ['US', 'CA'] min_hits = 1000 def __init__(self, host, aws_access_key_id, aws_secret_access_key): self.host = host self.connection = MTurkConnection( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, host=settings.MTURK_HOST ) self.connection.APIVersion = "2014-08-15" if not self.host: raise ValueError("Please provide a host url") def get_connection(self): return self.connection @staticmethod def _mturk_system_qualifications(qualification): requirements = [] for item in qualification.items.all(): if item.expression['attribute'] not in ['location', 'approval_rate', 'total_tasks']: continue requirement = None if item.expression['attribute'] == 'location': op = OP_IN if item.expression['operator'] == 'in' else OP_NOT_IN requirement = MultiLocaleRequirement(op, [val.strip() for val in item.expression['value'] if val is not None and val != '']) elif item.expression['attribute'] == 'approval_rate': op = OP_GT if item.expression['operator'] == 'gt' else OP_LT requirement = PercentAssignmentsApprovedRequirement(op, item.expression['value']) elif item.expression['attribute'] == 'total_tasks': op = OP_GT if item.expression['operator'] == 'gt' else OP_LT requirement = NumberHitsApprovedRequirement(op, item.expression['value']) requirements.append(requirement) return requirements def get_qualifications(self, project, boomerang_threshold, add_boomerang): requirements = [] if project.qualification is not None: requirements += self._mturk_system_qualifications(project.qualification) boomerang_qual, success = self.create_qualification_type(owner_id=project.owner_id, project_id=project.group_id, name='Boomerang Score #{}'.format(project.group_id), flag=FLAG_Q_BOOMERANG, description='No description available') boomerang = None if 
boomerang_threshold <= int(settings.BOOMERANG_MIDPOINT * 100): for i, bucket in enumerate(WAIT_LIST_BUCKETS): if int(bucket[1] * 100) <= boomerang_threshold: boomerang_blacklist, success = \ self.create_qualification_type(owner_id=project.owner_id, name='Boomerang Waitlist #{}-{}'.format(project.group_id, len( WAIT_LIST_BUCKETS) - i), flag=FLAG_Q_BOOMERANG, description='No description available', deny=True, project_id=project.group_id, bucket=bucket) if success and add_boomerang: boomerang = BoomerangRequirement(qualification_type_id=boomerang_blacklist.type_id, comparator=OP_DNE, integer_value=None) requirements.append(boomerang) else: boomerang = BoomerangRequirement(qualification_type_id=boomerang_qual.type_id, comparator=OP_GTEQ, integer_value=boomerang_threshold) if success and add_boomerang: requirements.append(boomerang) return Qualifications(requirements), boomerang_qual def create_hits(self, project, tasks=None, repetition=None): # if project.min_rating > 0: # return 'NOOP' if not tasks: cursor = connection.cursor() # noinspection SqlResolve query = ''' SELECT max(id) id, repetition, group_id, repetition - sum(existing_assignments) remaining_assignments, min_rating FROM ( SELECT t_rev.id, t.group_id, t.min_rating, p.repetition, CASE WHEN ma.id IS NULL OR ma.status IN (%(skipped)s, %(rejected)s, %(expired)s) THEN 0 ELSE 1 END existing_assignments FROM crowdsourcing_task t INNER JOIN crowdsourcing_project p ON t.project_id = p.id INNER JOIN crowdsourcing_task t_rev ON t_rev.group_id = t.group_id LEFT OUTER JOIN mturk_mturkhit mh ON mh.task_id = t_rev.id LEFT OUTER JOIN mturk_mturkassignment ma ON ma.hit_id = mh.id WHERE t.project_id = (%(project_id)s) AND t_rev.exclude_at IS NULL AND t_rev.deleted_at IS NULL ) t GROUP BY group_id, repetition, min_rating HAVING sum(existing_assignments) < repetition; ''' cursor.execute(query, {'skipped': TaskWorker.STATUS_SKIPPED, 'rejected': TaskWorker.STATUS_REJECTED, 'expired': TaskWorker.STATUS_EXPIRED, 'project_id': 
project.id}) tasks = cursor.fetchall() rated_workers = Rating.objects.filter(origin_type=Rating.RATING_REQUESTER).count() add_boomerang = rated_workers > 0 duration = project.timeout if project.timeout is not None else datetime.timedelta(hours=24) lifetime = project.deadline - timezone.now() if project.deadline is not None else datetime.timedelta( days=7) for task in tasks: question = self.create_external_question(task[0]) mturk_hit = MTurkHIT.objects.filter(task_id=task[0]).first() qualifications, boomerang_qual = self.get_qualifications(project=project, boomerang_threshold=int( round(task[4], 2) * 100), add_boomerang=add_boomerang) qualifications_mask = 0 if qualifications is not None: qualifications_mask = FLAG_Q_LOCALE + FLAG_Q_HITS + FLAG_Q_RATE + FLAG_Q_BOOMERANG hit_type, success = self.create_hit_type(title=project.name, description=self.description, price=project.price, duration=duration, keywords=self.keywords, approval_delay=datetime.timedelta(days=2), qual_req=qualifications, qualifications_mask=qualifications_mask, boomerang_threshold=int(round(task[4], 2) * 100), owner_id=project.owner_id, boomerang_qual=boomerang_qual) if not success: return 'FAILURE' if mturk_hit is None: try: hit = self.connection.create_hit(hit_type=hit_type.string_id, max_assignments=task[3], lifetime=lifetime, question=question)[0] self.set_notification(hit_type_id=hit.HITTypeId) mturk_hit = MTurkHIT(hit_id=hit.HITId, hit_type=hit_type, task_id=task[0]) except MTurkRequestError as e: error = e.errors[0][0] if error == 'AWS.MechanicalTurk.InsufficientFunds': message = { "type": "ERROR", "detail": "Insufficient funds on your Mechanical Turk account!", "code": error } redis_publisher = RedisPublisher(facility='bot', users=[project.owner]) message = RedisMessage(json.dumps(message)) redis_publisher.publish_message(message) return 'FAILED' else: if mturk_hit.hit_type_id != hit_type.id: result, success = self.change_hit_type_of_hit(hit_id=mturk_hit.hit_id, 
# NOTE(review): the first lines below are the tail of a method whose `def`
# lies above this chunk; indentation is reconstructed from a collapsed source.
                                                             hit_type_id=hit_type.string_id)
        if success:
            mturk_hit.hit_type = hit_type
            mturk_hit.save()
        return 'SUCCESS'

    def create_hit_type(self, owner_id, title, description, price, duration,
                        boomerang_threshold, keywords=None, approval_delay=None,
                        qual_req=None, qualifications_mask=0, boomerang_qual=None):
        """Return (MTurkHITType, bool) — an existing matching HIT type, or a
        freshly registered one. The bool is False only when the MTurk
        register_hit_type call fails (then the first element is None)."""
        # reuse an identical HIT type if one was already registered
        hit_type = MTurkHITType.objects.filter(owner_id=owner_id, name=title,
                                               description=description,
                                               price=Decimal(str(price)),
                                               duration=duration,
                                               qualifications_mask=qualifications_mask,
                                               boomerang_threshold=boomerang_threshold).first()
        if hit_type is not None:
            return hit_type, True
        reward = Price(price)
        try:
            # register remotely first; only persist locally once MTurk accepts it
            mturk_ht = self.connection.register_hit_type(title=title, description=description,
                                                         reward=reward, duration=duration,
                                                         keywords=keywords,
                                                         approval_delay=approval_delay,
                                                         qual_req=qual_req)[0]
            hit_type = MTurkHITType(owner_id=owner_id, name=title,
                                    description=description, price=Decimal(str(price)),
                                    keywords=keywords, duration=duration,
                                    qualifications_mask=qualifications_mask,
                                    boomerang_qualification=boomerang_qual,
                                    boomerang_threshold=boomerang_threshold)
            hit_type.string_id = mturk_ht.HITTypeId
            hit_type.save()
        except MTurkRequestError:
            return None, False
        return hit_type, True

    def create_external_question(self, task, frame_height=800):
        """Build a boto ExternalQuestion pointing at this host's task page,
        with the task id obfuscated via Hashids."""
        task_hash = Hashids(salt=settings.SECRET_KEY, min_length=settings.ID_HASH_MIN_LENGTH)
        task_id = task_hash.encode(task)
        url = self.host + '/mturk/task/?taskId=' + task_id
        question = ExternalQuestion(external_url=url, frame_height=frame_height)
        return question

    def update_max_assignments(self, task):
        """Reconcile the HIT's assignment count with the project's desired
        repetition; extends, expires or revives the HIT as needed.

        Raises MTurkHIT.DoesNotExist when the task has no associated HIT.
        """
        task = Task.objects.get(id=task['id'])
        mturk_hit = task.mturk_hit
        if not mturk_hit:
            raise MTurkHIT.DoesNotExist("This task is not associated to any mturk hit")
        # everything not rejected/skipped/expired counts as completed work
        assignments_completed = task.task_workers.filter(
            ~Q(status__in=[TaskWorker.STATUS_REJECTED,
                           TaskWorker.STATUS_SKIPPED,
                           TaskWorker.STATUS_EXPIRED])).count()
        remaining_assignments = task.project.repetition - assignments_completed
        # more work needed, all current assignments submitted, none in progress:
        # add one assignment slot and push the expiry out
        if remaining_assignments > 0 and mturk_hit.num_assignments == \
                mturk_hit.mturk_assignments.filter(status=TaskWorker.STATUS_SUBMITTED).count() and \
                mturk_hit.mturk_assignments.filter(status=TaskWorker.STATUS_IN_PROGRESS).count() == 0:
            self.add_assignments(hit_id=mturk_hit.hit_id, increment=1)
            self.extend_hit(hit_id=mturk_hit.hit_id)
            mturk_hit.status = MTurkHIT.STATUS_IN_PROGRESS
            mturk_hit.num_assignments += 1
            mturk_hit.save()
        elif remaining_assignments == 0:
            # fully satisfied: expire the HIT
            self.expire_hit(hit_id=mturk_hit.hit_id)
            mturk_hit.status = MTurkHIT.STATUS_EXPIRED
            mturk_hit.save()
        elif remaining_assignments > 0 and \
                mturk_hit.status == MTurkHIT.STATUS_EXPIRED:
            # HIT expired while work was still owed: revive it
            self.extend_hit(hit_id=mturk_hit.hit_id)
            mturk_hit.status = MTurkHIT.STATUS_IN_PROGRESS
        return 'SUCCESS'

    def get_assignment(self, assignment_id):
        """Return (assignment, True) on success; (assignment_id, False) when
        the assignment is in an invalid state; (None, False) otherwise."""
        try:
            return self.connection.get_assignment(assignment_id)[0], True
        except MTurkRequestError as e:
            error = e.errors[0][0]
            if error == 'AWS.MechanicalTurk.InvalidAssignmentState':
                return assignment_id, False
            return None, False

    def set_notification(self, hit_type_id):
        """Register this host's REST endpoint for assignment lifecycle events."""
        self.connection.set_rest_notification(hit_type=hit_type_id,
                                              url=self.host + '/api/mturk/notification',
                                              event_types=['AssignmentReturned',
                                                           'AssignmentAbandoned',
                                                           'AssignmentAccepted',
                                                           'AssignmentSubmitted'])

    def approve_assignment(self, task_worker):
        """Approve the MTurk assignment linked to a TaskWorker.
        Returns False only when the MTurk call itself fails; True when there
        is nothing to approve or approval succeeds."""
        task_worker_obj = TaskWorker.objects.get(id=task_worker['id'])
        if hasattr(task_worker_obj, 'mturk_assignments') and \
                task_worker_obj.mturk_assignments.first() is not None:
            try:
                self.connection.approve_assignment(
                    task_worker_obj.mturk_assignments.first().assignment_id)
            except MTurkRequestError:
                return False
        return True

    def reject_assignment(self, task_worker):
        """Reject the MTurk assignment linked to a TaskWorker (same return
        contract as approve_assignment)."""
        task_worker_obj = TaskWorker.objects.get(id=task_worker['id'])
        if hasattr(task_worker_obj, 'mturk_assignments') and \
                task_worker_obj.mturk_assignments.first() is not None:
            try:
                self.connection.reject_assignment(
                    task_worker_obj.mturk_assignments.first().assignment_id)
            except MTurkRequestError:
                return False
        return True

    def expire_hit(self, hit_id):
        # True on success, False on MTurk error
        try:
            self.connection.expire_hit(hit_id)
        except MTurkRequestError:
            return False
        return True

    def disable_hit(self, hit_id):
        # True on success, False on MTurk error
        try:
            self.connection.disable_hit(hit_id)
        except MTurkRequestError:
            return False
        return True

    def extend_hit(self, hit_id):
        # pushes the HIT expiry out by a fixed week
        try:
            self.connection.extend_hit(hit_id=hit_id, expiration_increment=604800)  # 7 days
        except MTurkRequestError:
            return False
        return True

    def add_assignments(self, hit_id, increment=1):
        # adds `increment` assignment slots to an existing HIT
        try:
            self.connection.extend_hit(hit_id=hit_id, assignments_increment=increment)
        except MTurkRequestError:
            return False
        return True

    def test_connection(self):
        """Probe credentials by fetching the account balance.
        Returns (balance, True) or (None, False)."""
        try:
            return self.connection.get_account_balance()[0], True
        except MTurkRequestError as e:
            error = e.errors[0][0]
            # NOTE(review): both branches return the same value; kept as-is
            if error == 'AWS.NotAuthorized':
                return None, False
            return None, False

    def get_account_balance(self):
        try:
            return self.connection.get_account_balance()[0]
        except MTurkRequestError:
            return None

    def create_qualification_type(self, owner_id, name, flag, description, project_id,
                                  auto_granted=False, auto_granted_value=None,
                                  deny=False, bucket=None):
        """Create (or reuse) a boomerang MTurkQualification and score every
        known mturk.* worker against it using weighted task ratings.

        Returns (qualification, True) on success, (None, False) when the
        remote create_qualification_type call fails.
        """
        # noinspection SqlResolve
        query = '''
            SELECT * FROM (
                SELECT task.target_id, task.username, round(task.task_w_avg::NUMERIC, 2) rating
                --round(coalesce(task.task_w_avg, requester.requester_w_avg,
                -- platform.platform_w_avg)::NUMERIC, 2) rating
                FROM (
                    SELECT target_id, origin_id, project_id, username,
                        sum(weight * power((%(BOOMERANG_TASK_ALPHA)s), t.row_number))
                            / sum(power((%(BOOMERANG_TASK_ALPHA)s), t.row_number)) task_w_avg
                    FROM (
                        SELECT r.id, r.origin_id, p.group_id project_id, weight, r.target_id,
                            -1 + row_number() OVER (PARTITION BY target_id
                                                    ORDER BY tw.created_at DESC) AS row_number,
                            u.username username
                        FROM crowdsourcing_rating r
                        INNER JOIN crowdsourcing_task t ON t.id = r.task_id
                        INNER JOIN crowdsourcing_project p ON p.id = t.project_id
                        INNER JOIN crowdsourcing_taskworker tw ON t.id = tw.task_id
                            AND tw.worker_id=r.target_id
                        INNER JOIN auth_user u ON u.id = r.target_id
                        WHERE origin_id = (%(origin_id)s) AND origin_type = (%(origin_type)s)) t
                    GROUP BY origin_id, target_id, project_id, username) task
                WHERE task.project_id = (%(project_id)s)
            ) r
            '''
        extra_query = 'WHERE rating BETWEEN (%(lower_bound)s) AND (%(upper_bound)s);'
        params = {
            'origin_type': Rating.RATING_REQUESTER,
            'origin_id': owner_id,
            'project_id': project_id,
            'BOOMERANG_REQUESTER_ALPHA': settings.BOOMERANG_REQUESTER_ALPHA,
            'BOOMERANG_PLATFORM_ALPHA': settings.BOOMERANG_PLATFORM_ALPHA,
            'BOOMERANG_TASK_ALPHA': settings.BOOMERANG_TASK_ALPHA
        }
        obj_params = {'upper_bound': 300, 'lower_bound': 100}
        if deny and bucket is not None:
            # blacklist mode: restrict the worker set to a rating bucket
            query += extra_query
            params.update({'upper_bound': bucket[1], 'lower_bound': bucket[0]})
            obj_params.update({'upper_bound': bucket[1] * 100,
                               'lower_bound': bucket[0] * 100,
                               'is_blacklist': True})
        cursor = connection.cursor()
        cursor.execute(query, params=params)
        worker_ratings_raw = cursor.fetchall()
        worker_ratings = [{"worker_id": r[0], "worker_username": r[1], "rating": r[2]}
                          for r in worker_ratings_raw]
        qualification = MTurkQualification.objects.filter(owner_id=owner_id, flag=flag,
                                                          name=name).first()
        assigned_workers = []
        if qualification is None:
            try:
                qualification_type = self.connection.create_qualification_type(
                    name=name, description=description, status='Active',
                    auto_granted=auto_granted, auto_granted_value=auto_granted_value)[0]
                qualification = MTurkQualification.objects.create(
                    owner_id=owner_id, flag=flag, name=name, description=description,
                    auto_granted=auto_granted, auto_granted_value=auto_granted_value,
                    type_id=qualification_type.QualificationTypeId, **obj_params)
            except MTurkRequestError:
                return None, False
        else:
            assigned_workers = MTurkWorkerQualification.objects.values('worker').filter(
                qualification=qualification).values_list('worker', flat=True)
        for rating in worker_ratings:
            # platform usernames of mturk workers look like "mturk.<WORKERID>"
            user_name = rating["worker_username"].split('.')
            if len(user_name) == 2 and user_name[0] == 'mturk':
                mturk_worker_id = user_name[1].upper()
                if mturk_worker_id not in assigned_workers:
                    self.assign_qualification(qualification_type_id=qualification.type_id,
                                              worker_id=mturk_worker_id,
                                              value=int(rating['rating'] * 100))
                defaults = {
                    'qualification': qualification,
                    'worker': mturk_worker_id,
                    'score': int(rating['rating'] * 100)
                }
                MTurkWorkerQualification.objects.update_or_create(
                    qualification=qualification, worker=mturk_worker_id, defaults=defaults)
        return qualification, True

    def change_hit_type_of_hit(self, hit_id, hit_type_id):
        # Returns (result, True) or (None, False) on MTurk error.
        try:
            result = self.connection.change_hit_type_of_hit(hit_id=hit_id,
                                                            hit_type=hit_type_id)
        except MTurkRequestError:
            return None, False
        return result, True

    def update_worker_boomerang(self, project_id, worker_id, task_avg, requester_avg):
        """
        Update boomerang for project
        Args:
            project_id:
            worker_id:
            task_avg:
            requester_avg
        Returns:
            str
        """
        hit = MTurkHIT.objects.select_related('hit_type__boomerang_qualification').filter(
            task__project__group_id=project_id).first()
        if hit is not None:
            qualification = hit.hit_type.boomerang_qualification
            worker_qual = MTurkWorkerQualification.objects.filter(qualification=qualification,
                                                                  worker=worker_id).first()
            if worker_qual is not None:
                # ratings are stored as 0-100 integer scores (x100)
                self.update_score(worker_qual, score=int(task_avg * 100), override=True)
            else:
                MTurkWorkerQualification.objects.create(qualification=qualification,
                                                        worker=worker_id,
                                                        score=int(task_avg * 100),
                                                        overwritten=True)
                self.assign_qualification(qualification_type_id=qualification.type_id,
                                          worker_id=worker_id, value=int(task_avg * 100))
            # other_quals = MTurkWorkerQualification.objects.filter(~Q(qualification=qualification),
            #                                                       worker=worker_id,
            #                                                       overwritten=False)
            # for q in other_quals:
            #     self.update_score(q, score=int(requester_avg * 100))
        return 'SUCCESS'

    def update_score(self, worker_qual, score, override=False):
        """Push a new qualification score to MTurk and mirror it locally.
        Returns False when worker_qual is None or the MTurk call fails."""
        if worker_qual is None:
            return False
        try:
            self.connection.update_qualification_score(worker_qual.qualification.type_id,
                                                       worker_qual.worker, score)
            worker_qual.overwritten = override
            worker_qual.score = score
            worker_qual.save()
        except MTurkRequestError:
            return False
        return True

    def assign_qualification(self, qualification_type_id, worker_id, value=1):
        """
        Assign a qualification to a WorkerId (no e-mail notification is sent).
        Args:
            qualification_type_id:
            worker_id:
            value
        Returns:
            bool
        """
        try:
            self.connection.assign_qualification(qualification_type_id, worker_id,
                                                 value, send_notification=False)
            return True
        except MTurkRequestError:
            return False

    def revoke_qualification(self, qualification_type_id, worker_id):
        # True on success, False on MTurk error
        try:
            self.connection.revoke_qualification(qualification_type_id=qualification_type_id,
                                                 subject_id=worker_id)
            return True
        except MTurkRequestError:
            return False

    def notify_workers(self, worker_ids, subject, message_text):
        # True on success, False on MTurk error
        try:
            self.connection.notify_workers(worker_ids, subject, message_text)
            return True
        except MTurkRequestError:
            return False
class MTurk(object):
    """
    A class that wraps a boto.mturk.connection object and provides methods
    for the most common AI2 use cases
    """

    def __init__(self, aws_access_key_id, aws_secret_access_key, host=SANDBOX_HOST):
        """
        initializes the instance with AWS credentials and a host
        :param aws_access_key_id the access key id.
        :param aws_secret_access_key the secret access key.
        :param host the mturk host to connect to
        """
        self.connection = MTurkConnection(aws_access_key_id=aws_access_key_id,
                                          aws_secret_access_key=aws_secret_access_key,
                                          host=host)
        self.host = host

    def __del__(self):
        """ close the connection whenever this object goes out of scope """
        self.connection.close()

    def get_account_balance(self):
        """ :return the balance on the mturk account """
        return self.connection.get_account_balance()[0]

    def _create_hit(self, params, **kwargs):
        """
        internal helper function for creating a HIT
        :param params the parameters (required and optional) common to all HITs
        :param **kwargs any other parameters needed for a specific HIT type
        :return the created HIT object
        """
        return self.connection.create_hit(
            title=params["title"],
            description=params["description"],
            keywords=params["keywords"],
            max_assignments=params["max_assignments"],
            reward=Price(amount=params["amount"]),
            qualifications=params["qualifications"],
            lifetime=params["lifetime"],
            # optional params below
            annotation=params.get("annotation"),
            **kwargs)

    def create_url_hit(self, params):
        """
        creates a HIT for an external question with a specified URL
        :param params a dict of the HIT parameters. must contain a "url" parameter
        :return the created HIT object
        """
        question = ExternalQuestion(params["url"], params["frame_height"])
        return self._create_hit(params, question=question)

    def create_html_hit(self, params):
        """
        creates a HIT for a question with the specified HTML
        :param params a dict of the HIT parameters, must contain a "html" parameter
        :return the created HIT object
        """
        question = HTMLQuestion(params["html"], params["frame_height"])
        return self._create_hit(params, question=question)

    def create_layout_hit(self, params):
        """
        creates a HIT for a question using the supplied layout id
        :param params a dict of the HIT parameters, must contain a "hit_layout"
            parameters with the layout id, and a "layout_params" parameter
            that's the dict of parameters to feed to the layout.
        """
        # create the LayoutParameters object from the supplied params
        layout_params = LayoutParameters([
            LayoutParameter(name, value)
            for name, value in params["layout_params"]
        ])
        return self._create_hit(params,
                                hit_layout=params["hit_layout"],
                                layout_params=layout_params)

    def delete_all_hits(self):
        """
        Permanently disables/ deletes all of the user's active HITs.
        :param mturk_connection: active mturk connection established by user in the notebook.
        :return:
        """
        my_hits = list(self.get_all_hits())
        for hit in my_hits:
            self.connection.disable_hit(hit.HITId)

    def get_assignments_object_list(self, assignment_dict):
        """
        Returns a list of "<boto.mturk.connection.Assignment object at...>" objects
        assignment_dict: a dictionary of HITId-assignment object pairs
        """
        assignments = []
        for entry in assignment_dict:
            for assignment_object in assignment_dict[entry]:
                assignments.append(assignment_object)
        return assignments

    def get_results_dict(self, HIT_assignments):
        """
        Takes a list of HIT assignment objects as input.
        Returns a list of dictionaries of HITs containing:
        HIT_id: the HIT ID
        worker_id: the worker ID of the Turker who completed the HIT
        answers: a dictionary of qid-answer field value pairs
        """
        assignment_results = []
        for assignment in HIT_assignments:
            HIT_dict = {}
            HIT_dict["assignment_object"] = assignment
            HIT_dict["worker_Id"] = assignment.WorkerId
            HIT_dict["HIT_id"] = assignment.HITId
            answers_dict = {}
            # answers[0] is the single QuestionFormAnswers document per assignment
            for answer in assignment.answers[0]:
                answers_dict[answer.qid] = answer.fields
            HIT_dict["answers"] = answers_dict
            assignment_results.append(HIT_dict)
        return assignment_results

    def get_all_results(self, hits):
        # map of HITId -> result dicts for every assignment of the given HITs
        all_results = {}
        for hid, assignments in self.get_assignments(hits).items():
            all_results[hid] = self.get_results_dict(assignments)
        return all_results

    def get_reviewable_hits(self, annotations=None, detailed=False):
        """
        Get all the reviewable HITs. By default returns minimal HIT objects,
        but will return detailed ones (by necessity) if annotations is
        specified or if detailed is True
        :param annotations an optional set of annotations to retrieve HITs for
        :param detailed do you want detailed HIT objects or minimal ones
        :return a list of HIT objects
        """
        minimal_hits = []
        page_num = 1
        # page through until an empty page is returned
        while True:
            more_hits = self.connection.get_reviewable_hits(page_size=100,
                                                            page_number=page_num)
            if more_hits:
                minimal_hits.extend(more_hits)
                page_num += 1
            else:
                break
        if detailed or annotations is not None:
            # annotation filtering requires the HITDetail response group
            detailed_hits = [
                self.connection.get_hit(hit.HITId,
                                        response_groups=('Minimal', 'HITDetail'))
                for hit in minimal_hits
            ]
            return [
                hit for hit in detailed_hits
                if annotation_filter(annotations, hit)
            ]
        else:
            return minimal_hits

    def get_all_hits(self, annotations=None):
        """
        Get all the HITs.
        :param annotations a set of annotations to get HITs for, all HITs if not specified
        :return a list of HIT objects
        """
        return [
            hit for hit in self.connection.get_all_hits()
            if annotation_filter(annotations, hit)
        ]

    def get_assignments(self, hits=None, hit_ids=None, status=None):
        """
        Retrieves individual assignments associated with the supplied HITs
        :param hits the HITs to get assignments for
        :status HIT status to filter by
        :return dict from HITId to lists of assignments
        """
        if hit_ids is None:
            hit_ids = [hit.HITId for hit in hits]
        return {
            hit_id: self.connection.get_assignments(hit_id, status=status)
            for hit_id in hit_ids
        }

    def disable_hit(self, hit=None, hit_id=None):
        """
        disable the specified hit (or the hit with the specified id).
        must specify either `hit` or `hit_id`
        :param hit a HIT object to disable
        :param hit_id a HITId to disable
        """
        hit_id = hit.HITId if hit is not None else hit_id
        return self.connection.disable_hit(hit_id)

    def approve_assignment(self, assignment=None, assignment_id=None, feedback=None):
        """
        approve the specified assignment (or the assigment with the specified id)
        must specify either `assignment` or `assignment_id`
        :param assignment an assignment object to approve
        :param assignment_id an AssignmentId to approve
        :param feedback optional feedback for the worker
        """
        assignment_id = assignment.AssignmentId if assignment is not None else assignment_id
        return self.connection.approve_assignment(assignment_id, feedback)

    def reject_assignment(self, assignment=None, assignment_id=None, feedback=None):
        """
        reject the specified assignment (or the assigment with the specified id)
        must specify either `assignment` or `assignment_id`
        :param assignment an assignment object to reject
        :param assignment_id an AssignmentId to reject
        :param feedback optional feedback for the worker
        """
        assignment_id = assignment.AssignmentId if assignment is not None else assignment_id
        return self.connection.reject_assignment(assignment_id, feedback)
# print 'here' if answer == control_labels[question]: approve = True # print '%s\t%s'%(question_form_answer.qid, question_form_answer.fields[0]) if approve == False: for assignment in assignments: for question_form_answer in assignment.answers[0]: f.writelines(question_form_answer.qid.encode('ascii', 'ignore')) conn.reject_assignment(assignment.AssignmentId) conn.disable_hit(hit.HITId) else: for assignment in assignments: for question_form_answer in assignment.answers[0]: f1.write(question_form_answer.qid.encode('ascii', 'ignore')) f1.write(question_form_answer.fields[0].encode('ascii', 'ignore') + '\n') conn.approve_assignment(assignment.AssignmentId) conn.disable_hit(hit.HITId) #Uncomment to approve assignment. Approving will remove this assignment from reviewable HITs, so store the data before approving #Uncomment to remove all remaining assignments that have not been completed and approved/rejected
def handle(self, *args, **options): # create a connection mturk = MTurkConnection( getattr(settings, 'MTURK_AWS_KEY', settings.MEDIASYNC['AWS_KEY']), getattr(settings, 'MTURK_AWS_SECRET', settings.MEDIASYNC['AWS_SECRET']), host='mechanicalturk.sandbox.amazonaws.com' if options['sandbox'] else 'mechanicalturk.amazonaws.com') # if --delete, delete all the old ones first. if options['delete_first']: for hit in mturk.get_all_hits(): mturk.disable_hit(hit.HITId) if options['exclude']: exclude_reader = csv.DictReader(open(options['exclude'], 'r')) exclude = set() for row in exclude_reader: exclude.add(row['td_id']) # iterate over items and create them one by one cursor = connection.cursor() cursor.execute( """ select entity_id, type from matchbox_wikipediainfo, matchbox_entity where entity_id not in (select entity_id from matchbox_sunlightinfo where bio is not null) and bio != '' and bio is not null and entity_id = matchbox_entity.id %s order by entity_id limit %s; """ % ( "and type = '%s'" % options['type'] if options['type'] else '', '%s' ), # hack to put the interpolation string back in for PG to catch it [options['count']]) for row in cursor: if options['exclude']: if str(row[0]).replace('-', '') in exclude: continue if options['practice']: print row[0] continue try: hit = mturk.create_hit( question=FakeQuestionForm(get_hit_xml(row[0])), max_assignments=3, annotation=row[0], title="Wikipedia match validation", description= "We have matched a set of entities in a database to descriptions pulled from Wikipedia via an automated process. 
Confirm that the match is correct.", reward=0.06, duration=datetime.timedelta(minutes=30), lifetime=datetime.timedelta(days=7), keywords=['wikipedia', 'matching'], approval_delay=datetime.timedelta(days=3), qualifications=Qualifications([ PercentAssignmentsApprovedRequirement( "GreaterThan", 90) ])) print hit[0].HITId except Exception as e: sys.stderr.write("Failed to create hit %s\n" % row[0]) sys.stderr.write(getattr(e, 'body', '')) sys.stderr.write('\n') except: pass
# NOTE(review): fragment — the `argparse.ArgumentParser(` call and the
# definitions of `parser`, `config`, `pd` and `MTurkConnection` begin above
# this chunk; indentation reconstructed from a whitespace-mangled source.
    description='Approve work from Amazon Mechanical Turk')
parser.add_argument('-r', '--resultsfile', required=True,
                    help='Filename for tab delimited CSV file')
parser.add_argument(
    '-s', '--sandbox', action='store_true',
    help='Run the command in the Mechanical Turk Sandbox (used for testing purposes)')
args = parser.parse_args()

if args.sandbox:
    # persist the sandbox flag in the boto config and target the sandbox site
    if not config.has_section('MTurk'):
        config.add_section('MTurk')
    config.set('MTurk', 'sandbox', 'True')
    mturk_website = 'requestersandbox.mturk.com'

results = pd.read_csv(args.resultsfile, sep='\t')
mtc = MTurkConnection(is_secure=True)
# NOTE(review): iterates the assignmentid column but disables by row index
# into the hitid column — presumably the two columns are row-aligned; verify.
for i, j in enumerate(list(results['assignmentid'])):
    print("deleting hit..{0}".format(i))
    try:
        mtc.disable_hit(results['hitid'][i])
    except:
        # best-effort: skip HITs that cannot be disabled
        continue
# -*- coding: utf-8 -*-
import os
from flask import Flask, render_template, url_for, request, make_response
from boto.mturk.connection import MTurkConnection
from boto.mturk.question import ExternalQuestion
from boto.mturk.qualification import Qualifications, PercentAssignmentsApprovedRequirement, NumberHitsApprovedRequirement
from boto.mturk.price import Price
import sys

# The HIT to disable is passed as the first command-line argument.
hit = sys.argv[1]

#Start Configuration Variables
AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']

# A second argument of 'pub' selects the production endpoint; anything else
# (or no second argument) targets the sandbox.
AMAZON_HOST = ("mechanicalturk.amazonaws.com"
               if len(sys.argv) > 2 and sys.argv[2] == 'pub'
               else "mechanicalturk.sandbox.amazonaws.com")

connection = MTurkConnection(aws_access_key_id=AWS_ACCESS_KEY_ID,
                             aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                             host=AMAZON_HOST)
connection.disable_hit(hit)
# NOTE(review): fragment — the enclosing loops over HITs/assignments and the
# definitions of `conn`, `f`, `f1`, `hit`, `assignments`, `approve`,
# `question_form_answer` and `control_labels` lie outside this chunk. The
# nesting of the reject/approve calls is reconstructed — confirm against the
# original before relying on it.
question = question_form_answer.qid.replace('\n', '')
answer = question_form_answer.fields[0]
# print question
if question in control_labels.keys():
    # print 'here'
    if answer == control_labels[question]:
        approve = True
# print '%s\t%s'%(question_form_answer.qid, question_form_answer.fields[0])
if approve == False:
    # control question failed: log qids, reject each assignment, kill the HIT
    for assignment in assignments:
        for question_form_answer in assignment.answers[0]:
            f.writelines(question_form_answer.qid.encode(
                'ascii', 'ignore'))
        conn.reject_assignment(assignment.AssignmentId)
    conn.disable_hit(hit.HITId)
else:
    # control question passed: persist qid/answer pairs, approve, kill the HIT
    for assignment in assignments:
        for question_form_answer in assignment.answers[0]:
            f1.write(question_form_answer.qid.encode('ascii', 'ignore'))
            f1.write(
                question_form_answer.fields[0].encode('ascii', 'ignore') + '\n')
        conn.approve_assignment(assignment.AssignmentId)
    conn.disable_hit(hit.HITId)
#Uncomment to approve assignment. Approving will remove this assignment from reviewable HITs, so store the data before approving
#Uncomment to remove all remaining assignments that have not been completed and approved/rejected
def deleteAllHits():
    """Disable every HIT visible to the account behind ``_host``.

    NOTE: this could instead accept a pickle file naming the specific HITs
    to dispose of (carried over from the original TODO).
    """
    turk_connection = MTurkConnection(host=_host)
    for existing_hit in turk_connection.get_all_hits():
        turk_connection.disable_hit(existing_hit.HITId)
# NOTE(review): fragment — the `MTurkConnection(` call this closes, the
# `connection`/`datetime`/`Price`/`ExternalQuestion` names, and the body of
# the final `for` loop all lie outside this chunk.
    host=HOST)

url = "https://mturk-poc.herokuapp.com/"
title = "Describe this group of people in your own words"
description = "Describe your first impressions of this group of people however you want."
keywords = ["easy"]
frame_height = 800
amount = 0.05

questionform = ExternalQuestion(url, frame_height)

# clear any pre-existing HITs before publishing the new one
all_hits = [hit for hit in connection.get_all_hits()]
if all_hits:
    for hit in all_hits:
        connection.disable_hit(hit.HITId)

create_hit_result = connection.create_hit(
    title=title,
    description=description,
    keywords=keywords,
    max_assignments=4,
    lifetime=datetime.timedelta(hours=2),
    question=questionform,
    reward=Price(amount=amount),
    response_groups=('Minimal', 'HITDetail'),
)

# re-list HITs after creation; the loop body continues beyond this chunk
all_hits = [hit for hit in connection.get_all_hits()]
for hit in all_hits:
def cleanup():
    """Remove any boto test related HIT's"""
    conn = MTurkConnection(host='mechanicalturk.sandbox.amazonaws.com')
    current_page = 1
    page_size = 10
    total_disabled = 0
    ignored = []
    while True:
        # reset the total for this loop
        disabled_count = 0
        # search all the hits in the sandbox
        search_rs = conn.search_hits(page_size=page_size,
                                     page_number=current_page)
        # success?
        if search_rs.status:
            for hit in search_rs:
                # delete any with Boto in the description
                print 'hit id:%s Status:%s, desc:%s' % (
                    hit.HITId, hit.HITStatus, hit.Description)
                if hit.Description.find('Boto') != -1:
                    if hit.HITStatus != 'Reviewable':
                        # non-reviewable HITs must be disabled
                        print 'Disabling hit id:%s %s' % (hit.HITId,
                                                          hit.Description)
                        disable_rs = conn.disable_hit(hit.HITId)
                        if disable_rs.status:
                            disabled_count += 1
                            # update the running total
                            total_disabled += 1
                        else:
                            print 'Error when disabling, code:%s, message:%s' % (
                                disable_rs.Code, disable_rs.Message)
                    else:
                        # reviewable HITs are disposed of instead
                        print 'Disposing hit id:%s %s' % (hit.HITId,
                                                          hit.Description)
                        dispose_rs = conn.dispose_hit(hit.HITId)
                        if dispose_rs.status:
                            disabled_count += 1
                            # update the running total
                            total_disabled += 1
                        else:
                            print 'Error when disposing, code:%s, message:%s' % (
                                dispose_rs.Code, dispose_rs.Message)
                else:
                    if hit.HITId not in ignored:
                        print 'ignored:%s' % hit.HITId
                        ignored.append(hit.HITId)
            # any more results?
            if int(search_rs.TotalNumResults) > current_page * page_size:
                # if we have disabled any HITs on this page
                # then we don't need to go to a new page
                # otherwise we do
                if not disabled_count:
                    current_page += 1
            else:
                # no, we're done
                break
        else:
            print 'Error performing search, code:%s, message:%s' % (
                search_rs.Code, search_rs.Message)
            break
    total_ignored = len(ignored)
    print 'Processed: %d HITs, disabled/disposed: %d, ignored: %d' % (
        total_ignored + total_disabled, total_disabled, total_ignored)
class TranscriptionPipelineHandler():
    """Drives audio clips through the transcription pipeline:
    Referenced -> Queued -> Hit -> Submitted -> Approved/Denied,
    using MTurk for the crowd work and Mongo for artifact state."""

    def __init__(self):
        aws_id = os.environ['AWS_ACCESS_KEY_ID']
        aws_k = os.environ['AWS_ACCESS_KEY']
        self.conn = MTurkConnection(aws_access_key_id=aws_id,\
                        aws_secret_access_key=aws_k,\
                        host=HOST)
        self.ah = AssignmentHandler(self.conn)
        self.th = TurkerHandler(self.conn)
        self.hh = HitHandler(self.conn, TEMPLATE_DIR)
        self.mh = MongoTranscriptionHandler()
        self.wh = WavHandler()
        self.ph = PromptHandler()
        self.filter = Filter(self.mh)
        # cached account balance; decremented locally as HITs are created
        self.balance = self.conn.get_account_balance()[0].amount
        self.logger = logging.getLogger(
            "transcription_engine.transcription_pipeline_handler")

    def audio_clip_referenced_to_hit(self, priority=1, max_queue_size=10):
        # queue every Referenced clip, attempting HIT creation after each enqueue
        for audio_clip in self.mh.get_artifacts_by_state("audio_clips", "Referenced"):
            audio_clip_id = audio_clip["_id"]
            self.mh.queue_clip(audio_clip_id, priority, max_queue_size)
            response = self.audio_clip_queue_to_hit()

    def audio_clip_queued_to_hit(self, priority=1, max_queue_size=10):
        for audio_clip in self.mh.get_artifacts("audio_clips", {"state": "Queued"}):
            audio_clip_id = audio_clip["_id"]
            response = self.audio_clip_queue_to_hit()
    #===================================================================
    # elif state == "Hit":
    #     print("In hit: %s"%audio_clip_url)
    #===================================================================

    def audio_clip_queue_to_hit(self, cost_sensitive=True):
        """Take queued audio clips from the audio clip queue
            put them in a hit and create the hit.
            If successful, update the audio clip state."""
        clip_queue = self.mh.get_audio_clip_queue()
        clip_pairs = self.mh.get_audio_clip_pairs(clip_queue)
        if clip_pairs:
            hit_title = "Audio Transcription"
            question_title = "List and Transcribe"
            description = "Transcribe the audio clip by typing the words the person says in order."
            keywords = "audio, transcription, audio transcription"
            if cost_sensitive:
                reward_per_clip = 0.02
                max_assignments = 3
                estimated_cost = self.hh.estimate_html_HIT_cost(
                    clip_pairs, reward_per_clip, max_assignments)
                clips_in_hits = self.mh.clips_already_in_hit(clip_pairs)
                if clips_in_hits:
                    #If one or more clips are already in a HIT, remove it from the queue
                    self.mh.remove_audio_clips_from_queue(clips_in_hits)
                elif self.balance - estimated_cost >= 250:
                    #if we have enough money, create the HIT
                    response = self.hh.make_html_transcription_HIT(
                        clip_pairs, hit_title, question_title, description, keywords)
                    self.balance = self.balance - estimated_cost
                    if type(response) == ResultSet and len(
                            response) == 1 and response[0].IsValid:
                        response = response[0]
                        self.mh.remove_audio_clips_from_queue(clip_queue)
                        audio_clip_ids = [
                            w["audio_clip_id"] for w in clip_queue
                        ]
                        hit_id = response.HITId
                        hit_type_id = response.HITTypeId
                        self.mh.create_transcription_hit_artifact(
                            hit_id, hit_type_id, clip_queue, "New")
                        self.logger.info("Successfully created HIT: %s" % hit_id)
                        return self.mh.update_audio_clips_state(
                            audio_clip_ids, "Hit")
                else:
                    pass
        return False

    def load_assignments_hit_to_submitted(self):
        """Check all assignments for audio clip IDs.
            Update the audio clips.
            This is a non-destructive load of the assignments from MTurk"""
        hits = self.conn.get_all_hits()
        for hit in hits:
            transcription_dicts = [{}]
            hit_id = hit.HITId
            assignments = self.conn.get_assignments(hit_id)
            have_all_assignments = True
            assignment_ids = []
            for assignment in assignments:
                assignment_ids.append(assignment.AssignmentId)
                if self.mh.get_artifact("assignments",
                                        {"_id": assignment.AssignmentId}):
                    #We create assignments here, so if we already have it, skip
                    continue
                else:
                    have_all_assignments = False
                transcription_ids = []
                transcription_dicts = self.ah.get_assignment_submitted_transcriptions(
                    assignment)
                if transcription_dicts and len(transcription_dicts) == 10:
                    pass
                # for/else: the artifact is only created when no clip was unknown
                for transcription in transcription_dicts:
                    if not self.mh.get_artifact_by_id(
                            "audio_clips", transcription["audio_clip_id"]):
                        self.logger.info("Assignment(%s) with unknown audio clip(%s) skipped"%\
                                    (assignment.AssignmentId,transcription["audio_clip_id"]))
                        break
                    self.mh.update_transcription_state(transcription, "Submitted")
                    self.mh.update_audio_clips_state(
                        [transcription["audio_clip_id"]], "Submitted")
                    transcription_ids.append(
                        self.mh.get_artifact(
                            "transcriptions", {
                                "audio_clip_id": transcription["audio_clip_id"],
                                "assignment_id": transcription["assignment_id"]
                            }, "_id"))
                else:
                    self.mh.create_assignment_artifact(assignment,
                                                       transcription_ids,
                                                       "Submitted")
            if assignments and not have_all_assignments:
                self.mh.update_transcription_hit_state(hit_id, "Submitted")
            print("Transcriptions HIT(%s) submitted assignments: %s " %
                  (hit_id, assignment_ids))

    def assignment_submitted_approved(self):
        """For all submitted assignments,
            if an answered question has a reference transcription, check the WER.
            If all the answered questions with reference transcriptions
            have an acceptable WER, approve the assignment and update
            the audio clips and transcriptions."""
        assignments = self.mh.get_artifacts_by_state("assignments", "Submitted")
        rejected_feedback = "I'm sorry but your work in assignment(%s) was rejected because" +\
                            " one or more of your transcriptions " +\
                            " had a word error rate above the maximum acceptable"+\
                            " word error rate of %s. Omitted words and words that "+\
                            " differed by more than %s "+\
                            " characters were counted as an error."
        accepted_feedback = "Your average word error rate on assignment(%s) was %s."+\
                            " Assignment accepted! Thanks for your hard work."
        for assignment in assignments:
            assignment_id = assignment["_id"]
            transcription_ids = assignment["transcriptions"]
            transcriptions = self.mh.get_artifacts("transcriptions", "_id",
                                                   transcription_ids)
            worker_id = assignment["worker_id"]
            worker_id = self.mh.create_worker_artifact(worker_id)
            approved, average_wer = self.filter.approve_assignment(transcriptions)
            if approved:
                try:
                    self.conn.approve_assignment(
                        assignment_id,
                        accepted_feedback % (assignment_id, average_wer))
                except MTurkRequestError as e:
                    print(e)
                else:
                    # MTurk approval succeeded: mirror the state locally
                    self.mh.update_assignment_state(assignment, "Approved")
                    for transcription in transcriptions:
                        #Approve transcriptions without references in the same assignment
                        reference_id = self.mh.get_artifact_by_id(
                            "audio_clips", transcription["audio_clip_id"],
                            "reference_transcription_id")
                        if not reference_id:
                            self.mh.update_transcription_state(
                                transcription, "Approved")
                    print("Approved transcription ids: %s" % transcription_ids)
            else:
                #Don't deny for now
                feedback = rejected_feedback % (assignment_id,
                                                self.filter.WER_THRESHOLD,
                                                self.filter.CER_THRESHOLD)
                self.logger.info(feedback)
                self.conn.reject_assignment(assignment_id, feedback)
                self.mh.update_assignment_state(assignment, "Denied")
            #print("Assignments not aproved %s "%denied)
            #Update the worker
            if approved:
                self.mh.add_assignment_to_worker(worker_id,
                                                 (assignment_id, average_wer))

    def _load_rm_audio_source_file_to_clipped(self,
                                              file_dir,
                                              prompt_file_uri,
                                              base_clip_dir,
                                              sample_rate=16000,
                                              http_base_url="http://www.cis.upenn.edu/~tturpen/wavs/",
                                              init_clip_count=200):
        """For an audio directory, see which files are new
            and not an audio source already"""
        prompt_dict = self.ph.get_prompts(prompt_file_uri)
        count = 0
        for root, dirs, files in os.walk(file_dir):
            for f in files:
                if count == init_clip_count:
                    return
                system_uri = os.path.join(root, f)
                # NOTE(review): str.strip(".sph") strips *characters*, not the
                # suffix — filenames starting/ending in s/p/h get mangled.
                out_uri = system_uri.strip(".sph") + ".wav"
                out_uri = os.path.basename(out_uri)
                out_uri = os.path.join(root, (out_uri))
                spkr_id = str(os.path.relpath(root, file_dir))
                #sph to wav
                if not f.endswith(".wav") and not os.path.exists(out_uri):
                    try:
                        self.wh.sph_to_wav(system_uri, out_uri=out_uri)
                    except WavHandlerException as e:
                        self.logger.error("Unable to create wav from sph: " + str(e))
                if os.path.exists(out_uri) and out_uri.endswith(".wav"):
                    #create audio source artifact
                    count += 1
                    wav_filename = os.path.basename(out_uri)
                    prompt_id = os.path.basename(out_uri).strip(".wav").upper()
                    encoding = ".wav"
                    sample_rate = 16000
                    disk_space = os.stat(out_uri).st_size
                    length_seconds = self.wh.get_audio_length(out_uri)
                    if prompt_id in prompt_dict:
                        transcription_prompt = prompt_dict[prompt_id]
                    else:
                        #No prompt found
                        raise PromptNotFound
                    source_id = self.mh.create_audio_source_artifact(
                        out_uri, disk_space, length_seconds, sample_rate,
                        spkr_id, encoding)
                    #create audio clip artifact
                    audio_clip_uri = os.path.join(base_clip_dir, spkr_id,
                                                  wav_filename)
                    clip_dir = os.path.dirname(audio_clip_uri)
                    if not os.path.exists(clip_dir):
                        os.makedirs(clip_dir)
                    if not os.path.exists(audio_clip_uri):
                        copyfile(out_uri, audio_clip_uri)
                    #http_url
                    http_url = os.path.join(http_base_url, spkr_id, wav_filename)
                    clip_id = self.mh.create_audio_clip_artifact(
                        source_id, 0, -1, audio_clip_uri, http_url,
                        length_seconds, disk_space)
                    #Update the audio source, updates state too
                    self.mh.update_audio_source_audio_clip(source_id, clip_id)
                    #Create the reference transcription artifact
                    transcription_id = self.mh.create_reference_transcription_artifact(
                        clip_id, transcription_prompt, "Gold")
                    #Completes audio clip to Referenced
                    self.mh.update_audio_clip_reference_transcription(
                        clip_id, transcription_id)

    def all_workers_liveness(self):
        # interactive review of each worker's approved/denied transcriptions
        workers = self.mh.get_all_workers()
        for worker in workers:
            worker_id = worker["_id"]
            approved, denied = self.mh.get_worker_assignments(worker)
            print("Worker(%s) assignments, approved(%s) denied(%s)" %
                  (worker["_id"], approved, denied))
            selection = input(
                "1. Show denied transcriptions and references.\n" +
                "2. Show accepted transcriptions and references.\n" +
                "3. Show both denied and accepted transcriptions.")
            if selection == 1 or selection == 3:
                print("Approved transcriptions")
                for assignment_id in approved:
                    transcription_pairs = self.mh.get_transcription_pairs(
                        assignment_id)
                    for pair in transcription_pairs:
                        print("Reference:\n\t%s\nHypothesis:\n\t%s\n" %
                              (pair[0], pair[1]))
            if selection == 2 or selection == 3:
                print("Denied transcriptions")
                for assignment_id in denied:
                    transcription_pairs = self.mh.get_transcription_pairs(
                        assignment_id)
                    for pair in transcription_pairs:
                        print("Reference:\n\t%s\nHypothesis:\n\t%s\n" %
                              (pair[0], pair[1]))

    def stats(self):
        # per-worker and overall average WER across approved assignments
        workers = self.mh.get_all_workers()
        all_wer_per_approved_assignment = 0.0
        total_accepted = 0.0
        for worker in workers:
            worker_wer = 0.0
            worker_id = worker["_id"]
            approved, denied = self.mh.get_worker_assignments_wer(worker)
            for w in approved:
                all_wer_per_approved_assignment += float(w[1])
                worker_wer += float(w[1])
                total_accepted += 1
            if approved:
                worker_average_wer = worker_wer / len(approved)
                print("%s,%s" % (len(approved), worker_average_wer))
        #print("Worker(%s) approved assignments(%s)\n denied assignments(%s)"%(worker_id,approved,denied))
        av = all_wer_per_approved_assignment / total_accepted
        print("Average WER per assignment(%s)" % (av))

    def get_assignment_stats(self):
        self.effective_hourly_wage_for_approved_assignments(.20)

    def effective_hourly_wage_for_approved_assignments(self, reward_per_assignment):
        """Calculate the effective hourly wage for Approved Assignments"""
        approved_assignments = self.mh.get_artifacts_by_state(
            "assignments", "Approved")
        total = datetime.timedelta(0)
        count = 0
        for assignment in approved_assignments:
            # NOTE(review): when "SubmitTime" is missing, the previous
            # iteration's accepted/submitted values are reused (or a
            # NameError is raised on the first iteration) — confirm intent.
            if "SubmitTime" in assignment:
                accepted = datetime.datetime.strptime(assignment["AcceptTime"],
                                                      "%Y-%m-%dT%H:%M:%SZ")
                submitted = datetime.datetime.strptime(
                    assignment["SubmitTime"], "%Y-%m-%dT%H:%M:%SZ")
            else:
                pass
            total += submitted - accepted
            count += 1
        seconds_per_assignment = total.total_seconds() / count
        effective_hourly_wage = 60.0 * 60.0 / seconds_per_assignment * reward_per_assignment
        print("Effective completion time(%s) *reward(%s) = %s" %
              (seconds_per_assignment, reward_per_assignment,
               effective_hourly_wage))

    def allhits_liveness(self):
        # interactive cleanup of live HITs
        #allassignments = self.conn.get_assignments(hit_id)
        #first = self.ah.get_submitted_transcriptions(hit_id,str(clipid))
        hits = self.conn.get_all_hits()
        for hit in hits:
            hit_id = hit.HITId
            print("HIT ID: %s" % hit_id)
            assignments = self.conn.get_assignments(hit_id)
            if len(assignments) == 0:
                if raw_input("Remove hit with no submitted assignments?(y/n)"
                             ) == "y":
                    try:
                        self.conn.disable_hit(hit_id)
                        clips = self.mh.get_artifact("transcription_hits",
                                                     {"_id": hit_id}, "clips")
                        self.mh.remove_transcription_hit(hit_id)
                        # releasing the HIT returns its clips to Referenced
                        self.mh.update_audio_clips_state(clips, "Referenced")
                    except MTurkRequestError as e:
                        raise e
            else:
                if raw_input("Remove hit with %s submitted assignments?(y/n)" %
                             len(assignments)) == "y":
                    try:
                        self.conn.disable_hit(hit_id)
                    except MTurkRequestError as e:
                        raise e

    def run(self):
        # interactive menu-driven entry point for the whole pipeline
        audio_file_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/ind_trn"
        #audio_file_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/dep_trn"
        prompt_file_uri = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/doc/al_sents.snr"
        base_clip_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/clips"
        selection = 0
        init_clip_count = 10000
        while selection != "11":
            selection = raw_input("""Audio Source file to Audio Clip Approved Pipeline:\n
                 1: AudioSource-FileToClipped: Initialize Resource Management audio source files to %d queueable(Referenced) clips
                 2: AudioClip-ReferencedToHit: Queue all referenced audio clips and create a HIT if the queue is full.
                 3: AudioClip-HitToSubmitted: Check all submitted assignments for Transcriptions.
                 4: AudioClip-SubmittedToApproved: Check all submitted clips against their reference.
                 5: Review Current Hits
                 6: Worker liveness
                 7: Account balance
                 8: Worker stats
                 9: Recalculate worker WER
                 10: Assignment Stats
                 11: Exit
                """ % init_clip_count)
            #selection = "5"
            if selection == "1":
                self._load_rm_audio_source_file_to_clipped(
                    audio_file_dir,
                    prompt_file_uri,
                    base_clip_dir,
                    init_clip_count=init_clip_count)
            elif selection == "2":
                self.audio_clip_referenced_to_hit()
            elif selection == "3":
                self.load_assignments_hit_to_submitted()
            elif selection == "4":
                self.assignment_submitted_approved()
            elif selection == "5":
                self.allhits_liveness()
            elif selection == "6":
                self.all_workers_liveness()
            elif selection == "7":
                print("Account balance: %s" % self.balance)
            elif selection == "8":
                self.stats()
            elif selection == "9":
                # NOTE(review): recalculate_worker_assignment_wer is commented
                # out below — selecting 9 raises AttributeError as-is.
                self.recalculate_worker_assignment_wer()
            elif selection == "10":
                self.get_assignment_stats()

#     def get_time_submitted_for_assignments(self):
#         assignments = self.mh.get_all_artifacts("assignments")
#         for assignment in assignments:
#             assignment_id = assignment["_id"]
#             a_assignment = self.conn.get_assignment(assignment_id)[0]
#             self.mh.update_artifact_by_id("assignments", assignment_id, "SubmitTime", a_assignment.SubmitTime)

#     def recalculate_worker_assignment_wer(self):
#         """For all submitted assignments,
#             if an answered question has a reference transcription,
#             check the WER.
# If all the answered questions with reference transcriptions # have an acceptable WER, approve the assignment and update # the audio clips and transcriptions.""" # assignments = self.mh.get_artifacts("assignments",{"state":"Approved"}) # for assignment in assignments: # assignment_id = assignment["_id"] # denied = [] # #If no transcriptions have references then we automatically approve the HIT # approved = True # transcription_ids = assignment["transcriptions"] # transcriptions = self.mh.get_transcriptions("_id",transcription_ids) # worker_id = assignment["worker_id"] # worker_id = self.mh.create_worker_artifact(worker_id) # # max_rej_wer = (0.0,0.0) # total_wer = 0.0 # for transcription in transcriptions: # #Normalize the transcription # #self.mh.normalize_transcription # reference_id = self.mh.get_audio_clip_by_id(transcription["audio_clip_id"],"reference_transcription_id") # if reference_id: # reference_transcription = self.mh.get_reference_transcription({"_id": reference_id}, # "transcription") # new_transcription = transcription["transcription"].split(" ") # if reference_transcription: # transcription_wer = cer_wer(reference_transcription,new_transcription) # total_wer += transcription_wer # if transcription_wer < WER_THRESHOLD: # self.logger.info("WER for transcription(%s) %d"%(transcription["transcription"],transcription_wer)) # else: # max_rej_wer = (transcription_wer,WER_THRESHOLD) # denied.append((reference_transcription,new_transcription)) # approved = False # average_wer = total_wer/len(transcriptions) # #Update the worker # self.mh.add_assignment_to_worker(worker_id,(assignment_id,average_wer))