def getReviewableAssignments(verbose=False):
    """Collect the assignments of every HIT on the account.

    Prints the total number of HITs, then a running 1-based counter for each
    HIT as it is processed. The HIT listing and each HIT's assignments are
    requested once and reused for both counting and collecting.

    Args:
        verbose: when true, also print each HIT id and the number of
            assignments fetched for it.

    Returns:
        A flat list of the assignment objects of all HITs. Only one page of
        up to 50 assignments is requested per HIT.
    """
    mtc = MTurkConnection(host=_host)

    # Materialise the listing once so the count and the loop share it.
    hits = list(mtc.get_all_hits())
    print("Number HITs: [%d]" % len(hits))

    collected = []
    for count, hit in enumerate(hits, 1):
        print(count)
        hit_assignments = list(mtc.get_assignments(hit.HITId, page_size=50))
        if verbose:
            print("-------------------------------------")
            print(hit.HITId)
            print("")
            print(" - number assignments: [%d]" % len(hit_assignments))
        collected.extend(hit_assignments)
    return collected
def get_progress(HITId):
    """Return how far a HIT is towards its assignment quota, as a percentage.

    Args:
        HITId: identifier of the HIT to inspect.

    Returns:
        float: 100 * (assignments reported for the HIT) / MaxAssignments.

    Raises:
        KeyError: if no HIT with this id is found in the account listing.
    """
    mtc = MTurkConnection(aws_access_key_id=ACCESS_ID,
                          aws_secret_access_key=SECRET_KEY,
                          host=HOST)

    # Stop scanning at the first match instead of indexing every HIT first.
    curr_hit = next(
        (hit for hit in mtc.get_all_hits() if hit.HITId == HITId), None)
    if curr_hit is None:
        raise KeyError(HITId)

    # One get_assignments() call returns a single page; TotalNumResults is
    # the count across all pages, so progress is not capped at the page size.
    assignments = mtc.get_assignments(curr_hit.HITId)
    num_assign_completed = int(assignments.TotalNumResults)
    num_assign_total = int(curr_hit.MaxAssignments)
    return (float(num_assign_completed) / float(num_assign_total)) * 100
def getReviewableAssignments():
    """Gather the assignments of all HITs and report summary counts.

    A HIT is counted as reviewable when the number of assignments fetched
    for it equals NUMBER_HIT_ASSIGNMENTS. Progress is printed every 500
    HITs, and three totals are printed at the end.

    Returns:
        A flat list with the fetched assignments of every HIT.
    """
    # note: if there are more than 100 assignments per hit the function
    # must be modified to retrieve all pages of the assignments
    page_size = 100
    mtc = MTurkConnection(host=_host)

    collected = []
    reviewable_hits = 0
    hits_seen = 0
    print("Analyzed [%d] HITs" % (hits_seen + 1))
    for hit in mtc.get_all_hits():
        hits_seen += 1
        if hits_seen % 500 == 0:
            print("Analyzed [%d] HITs" % hits_seen)

        hit_assignments = list(
            mtc.get_assignments(hit.HITId, page_size=page_size))
        if len(hit_assignments) == NUMBER_HIT_ASSIGNMENTS:
            reviewable_hits += 1
        # NOTE(review): assignments of every HIT are kept, not only those of
        # reviewable HITs -- confirm against the original indentation.
        collected.extend(hit_assignments)

    print("Total Number of HITs: [%d]" % (hits_seen))
    print("Total Number of Assignments: [%d]" % (len(collected)))
    print("Total Number of Reviewavle HITs: [%d]" % (reviewable_hits))
    return collected
def handle(self, *args, **options):
    """Create one "Wikipedia match validation" HIT per selected entity.

    Options read: ``sandbox`` (choose the sandbox host), ``delete_first``
    (disable all existing HITs first), ``exclude`` (path of a CSV file whose
    ``td_id`` column lists entity ids to skip), ``type`` (optional entity
    type filter), ``count`` (row limit) and ``practice`` (print the entity
    ids instead of creating HITs).

    A failure to create a HIT is reported on stderr and the loop continues
    with the next entity.
    """
    # create a connection
    mturk = MTurkConnection(
        getattr(settings, 'MTURK_AWS_KEY', settings.MEDIASYNC['AWS_KEY']),
        getattr(settings, 'MTURK_AWS_SECRET', settings.MEDIASYNC['AWS_SECRET']),
        host='mechanicalturk.sandbox.amazonaws.com' if options['sandbox'] else 'mechanicalturk.amazonaws.com'
    )

    # if --delete, delete all the old ones first.
    if options['delete_first']:
        for hit in mturk.get_all_hits():
            mturk.disable_hit(hit.HITId)

    exclude = set()
    if options['exclude']:
        # The context manager closes the CSV file once it has been read.
        with open(options['exclude'], 'r') as exclude_file:
            exclude = set(row['td_id'] for row in csv.DictReader(exclude_file))

    # The entity type is passed as a bound parameter instead of being spliced
    # into the SQL text; only a fixed clause is interpolated here, and the
    # doubled %% leaves the limit placeholder for the database driver.
    params = []
    type_clause = ''
    if options['type']:
        type_clause = 'and type = %s'
        params.append(options['type'])
    params.append(options['count'])

    # iterate over items and create them one by one
    cursor = connection.cursor()
    cursor.execute(
        """
        select entity_id, type
        from matchbox_wikipediainfo, matchbox_entity
        where entity_id not in (select entity_id from matchbox_sunlightinfo where bio is not null)
            and bio != ''
            and bio is not null
            and entity_id = matchbox_entity.id
            %s
        order by entity_id
        limit %%s;
        """ % type_clause,
        params)

    for row in cursor:
        entity_id = row[0]
        # ids are compared without dashes
        if str(entity_id).replace('-', '') in exclude:
            continue

        if options['practice']:
            print(entity_id)
            continue

        try:
            hit = mturk.create_hit(
                question=FakeQuestionForm(get_hit_xml(entity_id)),
                max_assignments=3,
                annotation=entity_id,
                title="Wikipedia match validation",
                description="We have matched a set of entities in a database to descriptions pulled from Wikipedia via an automated process. Confirm that the match is correct.",
                reward=0.06,
                duration=datetime.timedelta(minutes=30),
                lifetime=datetime.timedelta(days=7),
                keywords=['wikipedia', 'matching'],
                approval_delay=datetime.timedelta(days=3),
                qualifications=Qualifications([PercentAssignmentsApprovedRequirement("GreaterThan", 90)])
            )
            print(hit[0].HITId)
        except Exception as e:
            # Best effort per entity. KeyboardInterrupt/SystemExit are not
            # caught, so the command can still be interrupted.
            sys.stderr.write("Failed to create hit %s\n" % entity_id)
            sys.stderr.write(getattr(e, 'body', ''))
            sys.stderr.write('\n')
def get_final_score(HITId):
    """Score a HIT from its workers' answers, approve them and disable the HIT.

    Answers with question id ``'design'`` are summed as integer ratings;
    every other answer is scored with ``answer_key``. Each processed
    assignment is approved, then the HIT is disabled.

    Args:
        HITId: identifier of the HIT to score.

    Returns:
        float: 0.70 * (accuracy sum / number of design answers)
        + 0.30 * (rating sum / number of design answers).

    Raises:
        KeyError: if no HIT with this id is found in the account listing.
        ZeroDivisionError: if no ``'design'`` answer was found; this happens
            after the approvals and the disable call have been issued.
    """
    mtc = MTurkConnection(aws_access_key_id=ACCESS_ID,
                          aws_secret_access_key=SECRET_KEY,
                          host=HOST)

    # Stop scanning at the first match instead of indexing every HIT first.
    curr_hit = next(
        (hit for hit in mtc.get_all_hits() if hit.HITId == HITId), None)
    if curr_hit is None:
        raise KeyError(HITId)

    sum_opin = 0
    sum_acc = 0
    index = 0  # number of 'design' answers seen
    for assignment in mtc.get_assignments(curr_hit.HITId):
        for question_form_answer in assignment.answers[0]:
            for key in question_form_answer.fields:
                if question_form_answer.qid == 'design':
                    index += 1
                    sum_opin += int(key)
                else:
                    sum_acc += answer_key(key)
        mtc.approve_assignment(assignment.AssignmentId)

    mtc.disable_hit(curr_hit.HITId)

    avg_ratings = float(sum_opin) / float(index)
    avg_legib_score = float(sum_acc) / float(index)

    # Weighted average: 70% legibility accuracy, 30% ratings.
    # NOTE(review): the original also computed avg_ratings * 25 but never
    # used it; if the rating was meant to be rescaled before weighting, the
    # formula below needs changing -- confirm with the author.
    weighted_avg = 0.70 * avg_legib_score + 0.30 * avg_ratings
    return weighted_avg
# Create (or truncate) one log file per worker pool and open up its
# permissions to 0o777.
for i in xrange(NUMBEROFWORKERPOOLS):
    log_path = 'log/aql' + str(i)
    # The context manager closes the handle even if the write fails.
    with open(log_path, 'w') as f:
        f.write('')
    chmod(log_path, 0o777)

# Outside simulation mode, connect to the sandbox or the live endpoint and
# disable every HIT currently on the account.
if not SIMULATION:
    if SANDBOX:
        mturk_host = 'mechanicalturk.sandbox.amazonaws.com'
    else:
        mturk_host = 'mechanicalturk.amazonaws.com'
    mturk = MTurkConnection(AWSAKID, AWSSAK, host=mturk_host)
    for hit in mturk.get_all_hits():
        mturk.disable_hit(hit.HITId)
# Connect to the live MTurk endpoint.
HOST = 'mechanicalturk.amazonaws.com'
connection = MTurkConnection(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    host=HOST,
)

# Parameters of the HIT to publish.
url = "https://mturk-poc.herokuapp.com/"
title = "Describe this group of people in your own words"
description = "Describe your first impressions of this group of people however you want."
keywords = ["easy"]
frame_height = 800
amount = 0.05
questionform = ExternalQuestion(url, frame_height)

# Take a snapshot of the existing HITs, then disable each of them.
all_hits = list(connection.get_all_hits())
for hit in all_hits:
    connection.disable_hit(hit.HITId)

# Publish the new HIT.
create_hit_result = connection.create_hit(
    title=title,
    description=description,
    keywords=keywords,
    max_assignments=4,
    lifetime=datetime.timedelta(hours=2),
    question=questionform,
    reward=Price(amount=amount),
    response_groups=('Minimal', 'HITDetail'),
)
args = parser.parse_args()

# --sandbox forces the [MTurk] sandbox flag on in the loaded config.
if args.sandbox:
    if not config.has_section('MTurk'):
        config.add_section('MTurk')
    config.set('MTurk', 'sandbox', 'True')

# Read the HIT ids of interest from the tab-separated success file.
hitids = None
with open(expanduser(args.successfile), 'r') as successfile:
    reader = DictReader(successfile, delimiter='\t')
    hitids = [row['hitid'] for row in reader]

mtc = MTurkConnection(is_secure=True, profile_name=args.profile)

# To get any information about status, you have to get the HIT via get_all_hits
# If you just use get_hit() it gets minimal info
all_hits = mtc.get_all_hits()

currhits = []
for h in all_hits:
    if h.HITId not in hitids:
        continue
    currhits.append(h)
    # get_all_hits iterates through all your current HITs, grabbing 100 at a time
    # best to break as soon as you get all the HITIds in your group
    if len(currhits) == len(hitids):
        break

for c in currhits:
    print(display_hit(c, verbose=True))
    #print('HITId: {}'.format(c.HITId))
    # print('HITTypeId: {}'.format(c.HITTypeId))
def handle(self, *args, **options):
    """Create one "Wikipedia match validation" HIT per selected entity.

    Options read: ``sandbox`` (choose the sandbox host), ``delete_first``
    (disable all existing HITs first), ``exclude`` (path of a CSV file whose
    ``td_id`` column lists entity ids to skip), ``type`` (optional entity
    type filter), ``count`` (row limit) and ``practice`` (print the entity
    ids instead of creating HITs).

    A failure to create a HIT is reported on stderr and the loop continues
    with the next entity.
    """
    # create a connection
    if options['sandbox']:
        host = 'mechanicalturk.sandbox.amazonaws.com'
    else:
        host = 'mechanicalturk.amazonaws.com'
    mturk = MTurkConnection(
        getattr(settings, 'MTURK_AWS_KEY', settings.MEDIASYNC['AWS_KEY']),
        getattr(settings, 'MTURK_AWS_SECRET',
                settings.MEDIASYNC['AWS_SECRET']),
        host=host)

    # if --delete, delete all the old ones first.
    if options['delete_first']:
        for hit in mturk.get_all_hits():
            mturk.disable_hit(hit.HITId)

    exclude = set()
    if options['exclude']:
        # The context manager closes the CSV file once it has been read.
        with open(options['exclude'], 'r') as exclude_file:
            for row in csv.DictReader(exclude_file):
                exclude.add(row['td_id'])

    # Bind the entity type as a query parameter rather than formatting it
    # into the SQL text. Only a fixed clause is interpolated, and the doubled
    # %% leaves the limit placeholder for the database driver.
    if options['type']:
        type_clause = 'and type = %s'
        params = [options['type'], options['count']]
    else:
        type_clause = ''
        params = [options['count']]

    # iterate over items and create them one by one
    cursor = connection.cursor()
    cursor.execute(
        """
        select entity_id, type
        from matchbox_wikipediainfo, matchbox_entity
        where entity_id not in (select entity_id from matchbox_sunlightinfo where bio is not null)
            and bio != ''
            and bio is not null
            and entity_id = matchbox_entity.id
            %s
        order by entity_id
        limit %%s;
        """ % type_clause,
        params)

    for row in cursor:
        # ids are compared without dashes
        if str(row[0]).replace('-', '') in exclude:
            continue

        if options['practice']:
            print(row[0])
            continue

        try:
            hit = mturk.create_hit(
                question=FakeQuestionForm(get_hit_xml(row[0])),
                max_assignments=3,
                annotation=row[0],
                title="Wikipedia match validation",
                description=
                "We have matched a set of entities in a database to descriptions pulled from Wikipedia via an automated process. Confirm that the match is correct.",
                reward=0.06,
                duration=datetime.timedelta(minutes=30),
                lifetime=datetime.timedelta(days=7),
                keywords=['wikipedia', 'matching'],
                approval_delay=datetime.timedelta(days=3),
                qualifications=Qualifications([
                    PercentAssignmentsApprovedRequirement(
                        "GreaterThan", 90)
                ]))
            print(hit[0].HITId)
        except Exception as e:
            # Best effort per entity. KeyboardInterrupt/SystemExit are not
            # caught, so the command can still be interrupted.
            sys.stderr.write("Failed to create hit %s\n" % row[0])
            sys.stderr.write(getattr(e, 'body', ''))
            sys.stderr.write('\n')
class MTurkClient:
    """Convenience wrapper around a boto ``MTurkConnection`` (``self.c``)."""

    # SETUP
    # ===========

    def __init__(self,aws_access_key,aws_secret_key,aws_mode):
        """Connect to the sandbox host when ``aws_mode == 'sandbox'``, else
        to the live host."""
        self.mode = aws_mode
        if aws_mode == 'sandbox':
            self.host = 'mechanicalturk.sandbox.amazonaws.com'
        else:
            self.host = 'mechanicalturk.amazonaws.com'

        self.c = MTurkConnection(
            aws_access_key,
            aws_secret_key,
            host=self.host)

    # Defaults for create_hit(); callers override individual keys through
    # ``extra_settings``.
    default_settings = {
        'lifetime': DAY,
        'duration': 10 * MINUTE,
        'approval_delay': DAY,

        'title': "[title]",
        'description': "[description]",
        'keywords': [],

        'reward': 0.01,
        'max_assignments': 1,

        'height': 700,

        'qualifications': [],
    }

    # HITS
    # ===========
    def create_hit(self,url,extra_settings):
        """Create an ExternalQuestion HIT pointing at ``url``.

        Eventually, this should take a TEMPLATE and a dictionary of INPUT
        data that's put into that template. This function would then create
        an HTML file locally (assuming we're running on a web server) by
        replacing template {tags} with input values, and then send the URL
        to the newly created page to MTurk.

        Args:
            url: address of the external question page.
            extra_settings: dict of overrides applied on top of
                ``default_settings``.

        Returns:
            Tuple ``(HITId, HITTypeId)`` of the created HIT.
        """
        settings = self.default_settings.copy()
        settings.update(extra_settings)

        # Convert the plain values into the forms passed to boto.
        settings['reward'] = Price(settings['reward'])
        settings['qualifications'] = qualification.Qualifications(settings['qualifications'])
        settings['keywords'] = ','.join(settings['keywords'])
        height = settings.pop('height')

        hit = self.c.create_hit(question=ExternalQuestion(url,height),**settings)[0]
        #print 'Created hit %s' % hit.HITId
        return hit.HITId,hit.HITTypeId

        #hit_type=None, # Let Amazon do this automatically
        #annotation=None, # Optional annotation for our system to use
        #questions=None, # If you want to create multiple HITs at a time? Probably irrelevant for External
        #response_groups=None, # Unclear what this does

    def get_hit(self,hit_id):
        """Return the first record of boto's ``get_hit`` result."""
        return self.c.get_hit(hit_id)[0]

    def hit_results(self,hit_id,type=None): # type in ['Submitted','Approved','Rejected',None]
        """Return the answers of a HIT's assignments.

        Args:
            hit_id: HIT to query.
            type: assignment status to filter on, or None for no filter.
                It is forwarded as boto's ``status`` argument.

        Returns:
            Dict keyed by assignment id. Each value maps question id to the
            first response field, plus ``worker_id``, ``accept_time`` and
            ``submit_time`` (parsed datetimes). Only one page of up to 100
            assignments is requested.
        """
        results = {}

        assignments = self.c.get_assignments(hit_id, status=type, page_size=100)

        for asst in assignments:
            record = results.setdefault(asst.AssignmentId,{})
            answers = asst.answers[0]
            for qfa in answers:
                field, response = qfa.qid, qfa.fields[0]
                record[field] = response

            record['worker_id'] = asst.WorkerId

            record['accept_time'] = datetime.strptime(asst.AcceptTime,"%Y-%m-%dT%H:%M:%SZ")
            record['submit_time'] = datetime.strptime(asst.SubmitTime,"%Y-%m-%dT%H:%M:%SZ")

        return results

    # URL of a HIT on MTurk
    def hit_url_turk(self,hit_id):
        pass

    def hit_url_external(self,hit_id):
        pass

    def extend_hit(self,hit_id,extras):
        return self.c.extend_hit(hit_id, extras)

    @catcherror
    def delete_hit(self,hit_id):
        self.c.disable_hit(hit_id)

    # Deletes all the HITS on the server. Risky!
    def cleanup(self):
        for hit in self.c.get_all_hits():
            self.delete_hit(hit.HITId)

    # ASSIGNMENTS
    # ===========
    @catcherror
    def approve(self, asst_id, feedback=None):
        return self.c.approve_assignment(asst_id, feedback)

    @catcherror
    def reject(self, asst_id, feedback=None):
        return self.c.reject_assignment(asst_id, feedback)

    def block(self,worker_id,feedback=None):
        return self.c.block_worker(worker_id, feedback)

    def unblock(self,worker_id,feedback=None):
        return self.c.unblock_worker(worker_id, feedback)

    def bonus(self,asst,amount,feedback):
        """Grant ``amount`` as a bonus, using ``asst.worker`` and
        ``asst.asst_id`` to identify the recipient."""
        return self.c.grant_bonus(asst.worker, asst.asst_id, Price(amount), feedback)

    # STATUS / DIAGNOSTICS
    # --------------------
    def balance(self):
        return self.c.get_account_balance()[0]
# Mturk
access_key_id, secretkey = parse_credentials_file(section_name='MTurkCredentials')
if sandbox:
    conn = MTurkConnection(aws_access_key_id=access_key_id,
                           aws_secret_access_key=secretkey,
                           host='mechanicalturk.sandbox.amazonaws.com')
else:
    conn = MTurkConnection(aws_access_key_id=access_key_id,
                           aws_secret_access_key=secretkey)

# --- hits --- retrieve Pose Judgement experiments only
allhits = list(conn.get_all_hits())
hits = []
for hit in allhits:
    title = hit.Title
    if 'Pose' not in title or 'changed' not in title:
        continue
    hits.append(hit)

# Print each selected HIT followed by the status of its assignments.
for hit in hits:
    print("%s : %s" % (hit.HITId, hit.Title))
    assignments = conn.get_assignments(hit.HITId)
    print("%s  assignments for this HIT" % len(assignments))
    for a in assignments:
        print("%s : %s" % (a.AssignmentId, a.AssignmentStatus))

# mongoDB
mongo_conn = pm.Connection(host='localhost', port=22334)
db = mongo_conn['mturk']
coll = db['rosch_pose_exp']
def get_all_hit_stats():
    """Print one ``<HITId> - <expired>`` line for every HIT on the account."""
    connection = MTurkConnection(
        aws_access_key_id=MTURK_ACCESS_KEY,
        aws_secret_access_key=MTURK_SECRET_KEY,
        host=MTURK_HOST,
    )
    # Collect all rows first, then print them.
    stats = [(hit.HITId, hit.expired) for hit in connection.get_all_hits()]
    for hit_id, expired in stats:
        print("%s - %s" % (hit_id, expired))
#!flask/bin/python
# Script that disables/expires all current HITs released under me as a requester.
# Disable means completely delete the HIT.
# Expire means Workers can't view it anymore but you can still review and
# approve/reject it.
from boto.mturk.connection import MTurkConnection
from secret import SECRET_KEY,ACCESS_KEY,AMAZON_HOST

#Start Configuration Variables
AWS_ACCESS_KEY_ID = ACCESS_KEY
AWS_SECRET_ACCESS_KEY = SECRET_KEY

connection = MTurkConnection(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    host=AMAZON_HOST,
)

# Fetch the complete listing up front, show it, then expire each HIT.
hits_lst = [hit for hit in connection.get_all_hits()]
print(hits_lst)
for hit in hits_lst:
    print("Expiring HIT ID:  %s" % hit.HITId)
    connection.expire_hit(hit.HITId)
    #print "Disabling HIT ID: ",hit.HITId
    #connection.disable_hit(hit.HITId)
class Mturk():
    """MTurk helper configured from a YAML file.

    The configuration supplies ``aws_access_key_id``,
    ``aws_secret_access_key`` and ``host`` for the connection, and
    ``title``, ``description`` and ``keywords`` for created HITs.
    """

    def __init__(self):
        self.config = self.set_config()
        self.mturk = MTurkConnection(
            aws_access_key_id=self.config['aws_access_key_id'],
            aws_secret_access_key=self.config['aws_secret_access_key'],
            host=self.config['host'])
        self.mturk_tmpl = MturkTmpl()

    def set_config(self, config_path="config.yml"):
        """Load and return the configuration stored at ``config_path``.

        ``yaml.safe_load`` is used so the file can only produce plain data;
        ``yaml.load`` without an explicit Loader can construct arbitrary
        objects and is rejected by current PyYAML releases.
        """
        with open(config_path, 'r') as config_file:
            return yaml.safe_load(config_file)

    def account_balance(self):
        """Print the account balance as a connection check."""
        account_balance = self.mturk.get_account_balance()
        print("Testing connection: You have a balance of: {}".format(
            account_balance))

    def get_hits(self):
        """Return the connection's listing of all HITs."""
        return self.mturk.get_all_hits()

    def get_all_assignments(self, hit_id):
        """Return an iterator over the assignments of ``hit_id``, all pages.

        The first request is made only to read ``TotalNumResults``; the
        pages themselves are fetched as the returned iterator is consumed.
        """
        page_size = 100
        assignments = self.mturk.get_assignments(hit_id, page_size=page_size)
        total_records = int(assignments.TotalNumResults)
        page_nums = self.mturk._get_pages(page_size=page_size,
                                          total_records=total_records)
        # A generator expression keeps the per-page requests lazy and works
        # on both Python 2 and 3 (itertools.imap exists only on Python 2).
        assignments_sets = (
            self.mturk.get_assignments(hit_id, page_size=page_size,
                                       page_number=page)
            for page in page_nums)
        return itertools.chain.from_iterable(assignments_sets)

    def remove_old_hits(self):
        """Disable every HIT returned by get_hits(), printing each id."""
        # Disable old hits.
        for hit in self.get_hits():
            print("Hit {} has been removed.".format(hit.HITId))
            self.mturk.disable_hit(hit.HITId)

    def cal_reward(self, data):
        """Return the reward for a task, rounded to two decimals.

        The estimate is ``(3.0 + len(data['ents']) / 30.0) / 60.0 * 6.0``.
        NOTE(review): presumably minutes of work priced at an hourly rate
        of 6 -- confirm the intended units.
        """
        read_instruction = 3.0
        word_count = len(data['ents']) * 1 / 30.0
        return round((read_instruction + word_count) / 60.0 * 6.0, 2)

    def create_hit(self, data):
        """Create a single-assignment HIT for ``data`` and return the
        response of the create call."""
        # These parameters define the HIT that will be created
        # question is what we defined above
        # max_assignments is the # of unique Workers you're requesting
        # title, description, and keywords help Workers find your HIT
        # duration is the # of seconds Workers have to complete your HIT
        # reward is what Workers will be paid when you approve their work
        # Check out the documentation on CreateHIT for more details
        response = self.mturk.create_hit(
            question=self.mturk_tmpl.html_question(data),
            max_assignments=1,
            title=self.config['title'],
            description=self.config['description'],
            keywords=self.config['keywords'],
            duration=120,
            reward=self.cal_reward(data))
        return response
class MturkHelper(object):
    """
        This class handles task creation for the Amazon Mechanical Turk
        service.

        Amazon MTurk is used to crowdsource matching products.

        Initialisation :
            - reference : reference of the product
            - osm_from : the origin osm of a product
            - osm_to : the osm to look into
    """
    # SECURITY: credentials always come from settings. A literal key pair was
    # previously committed here for the non-sandbox case; it must be treated
    # as compromised and rotated. The sandbox and the live endpoint differ
    # through settings.HOST, not through the credentials.
    AWS_SECRET_ACCESS_KEY = settings.AWS_SECRET_ACCESS_KEY
    AWS_ACCESS_KEY_ID = settings.AWS_ACCESS_KEY_ID

    def __init__(self, reference = None, osm_from = None, osm_to = None, key = None, hitid = None):
        self.reference = reference
        self.osm_from = osm_from
        self.osm_to = osm_to
        self.key = key
        self.hitid = hitid
        if key is None:
            self.task = None
        else:
            # NOTE(review): get_task() is not defined in the visible part of
            # this class -- confirm it exists elsewhere.
            self.task = self.get_task()

        self.mtc = MTurkConnection(aws_access_key_id=MturkHelper.AWS_ACCESS_KEY_ID,
                                   aws_secret_access_key=MturkHelper.AWS_SECRET_ACCESS_KEY,
                                   host=settings.HOST)

    def get_all_reviewable_hits(self):
        """Return the reviewable HITs of all pages, 50 per request.

        The first page's result set is extended in place with the later
        pages and returned.
        """
        page_size = 50
        hits = self.mtc.get_reviewable_hits(page_size=page_size)
        print("Total results to fetch %s " % hits.TotalNumResults)
        print("Request hits page %i" % 1)

        # Integer ceiling of total / page_size.
        total_results = int(hits.TotalNumResults)
        total_pages = (total_results + page_size - 1) // page_size

        for pn in range(2, total_pages + 1):
            print("Request hits page %i" % pn)
            temp_hits = self.mtc.get_reviewable_hits(page_size=page_size,page_number=pn)
            hits.extend(temp_hits)
        return hits

    def get_hits(self, validate = False, all_hits = False):
        """Walk the HITs, store 'flagged' answers and optionally validate.

        Args:
            validate: when true, approve assignments (see note below) and
                disable each HIT after processing it.
            all_hits: when true, walk every HIT instead of only the
                reviewable ones.
        """
        if not all_hits:
            hits = self.get_all_reviewable_hits()
        else:
            hits = self.mtc.get_all_hits()

        for hit in hits:
            print("####################")
            print("--------------------")
            print("HitId = %s"%(hit.HITId))
            assignments = self.mtc.get_assignments(hit.HITId)

            # Getting task associated to hit
            task = Task.objects.filter(hitId = hit.HITId)
            print('Number of corresponding tasks = %d'%len(task))
            if len(task)>0:
                task = task[0]
            else:
                task = None

            for assignment in assignments:
                print("AssignmentId = %s"%(assignment.AssignmentId))
                print("Answers of the worker %s" % assignment.WorkerId)
                for question_form_answer in assignment.answers[0]:
                    qid = question_form_answer.qid
                    if qid == 'flagged':
                        for value in question_form_answer.fields:
                            # Saving resultTask
                            # NOTE(review): the original indentation was
                            # lost; ``elif validate`` is paired with the
                            # nearest ``if`` here -- confirm.
                            if task is not None:
                                print('Saving result task, result = %s'%(value))
                                resulttask, created = ResultTask.objects.get_or_create(task = task, assignementId = assignment.AssignmentId, workerId = assignment.WorkerId)
                                resulttask.reference = value
                                resulttask.save()
                            elif validate:
                                try:
                                    self.mtc.approve_assignment(assignment.AssignmentId)
                                except Exception as e:
                                    print(e)
            try:
                if validate:
                    self.mtc.disable_hit(hit.HITId)
            except Exception as e:
                print(e)

            print("--------------------")
aws_access_key_id = config.get( 'AWS Access', 'aws_access_key_id' ), aws_secret_access_key = config.get( 'AWS Access', 'aws_secret_access_key' ), host=host) mtc = MTurkConnection( **mturkparams ) # Now let's get a list of all the assignment objects page_size = 50 hitpages = [] newhits = True #while newhits: # newhits = mtc.get_all_hits() # hitpages.append(newhits) # print dir(newhits) hitpages = list( mtc.get_all_hits() ) hitpages.extend( list( mtc.get_reviewable_hits(page_size=100) ) ) total_pages = len(hitpages) #pn = 1 #thehits = [] #while pn < total_pages: # print "Request hits page %i" % pn # temp_hits = mtc.get_reviewable_hits( page_size = page_size, page_number=pn ) # thehits.extend( temp_hits ) # pn += 1 hits = {} assignments = {} workers = {}
class MyMTurk:
    """Demo client exercising boto's MTurk API against the sandbox host.

    Call ``register_hit_type()`` first; the three HIT-creating methods do
    nothing until a HIT type id is available.
    """
    AWS_ACCESS_KEY_ID = 'skip'
    AWS_SECRET_ACCESS_KEY = 'skip'
    HOST_SANDBOX = 'mechanicalturk.sandbox.amazonaws.com'
    HOST_MTURK = 'mechanicalturk.amazonaws.com'
    EXTERNAL_URL = 'http://redbug0314.blogspot.com/p/imcrowd.html'

    # Set by register_hit_type() on success. Defaulting to None lets the
    # ``if self.hit_type_id`` guards skip HIT creation instead of raising
    # AttributeError when registration was not run or failed.
    hit_type_id = None

    # (image url, alt text) pairs offered by question_form_formatted_content.
    _STEAK_IMAGES = [
        ('http://images.google.com/images?q=tbn:ANd9GcSh1HXq3WyOvvG7-AgvNugKC2LzImMUvUDNTuDAPwVKuw8NZzvLN62pGYhX:farm1.static.flickr.com/21/24204504_e143536a2e.jpg', 'steak1'),
        ('http://images.google.com/images?q=tbn:ANd9GcTkMoChevUBvQfmfksKDBM5oj4V2ruj6riqv7kC-_6qf9MR0igeBlJLkSI:www.miranchomeatmarket.com/images/T-%2520bone%2520steak.jpg', 'steak2'),
        ('http://images.google.com/images?q=tbn:ANd9GcSttsqT7kj9siDKZg1p4fU6W9IFlMZHCFSxFd49ECJR1Bu_1QlHQwmH1DU:img4.myrecipes.com/i/recipes/ck/06/08/grilled-steak-ck-1215910-l.jpg', 'steak3'),
        ('http://images.google.com/images?q=tbn:ANd9GcRfdQ-vuNt-W4W7JZRkAmbZpE6LLA0puCQs5erSzrGtsOY8H8t-vgEzqA:www.greendiamondgrille.com/images/new/NewYorkStripSteak.jpg', 'steak4'),
        ('http://images.google.com/images?q=tbn:ANd9GcTsJzCp6En1R9yvFQw7bGsSxiiQCqlMrFg7XCbcJ13G39Aa3e6ZilWW34oI:www.bunrab.com/dailyfeed/dailyfeed_images_jan-07/df07_01-08_steak.jpg', 'steak5'),
    ]

    def __init__(self):
        #connect to MTurk
        self.connect = MTurkConnection(self.AWS_ACCESS_KEY_ID,
                                       self.AWS_SECRET_ACCESS_KEY,
                                       host=self.HOST_SANDBOX)

        #Qualification setting
        q = self.qualifications = Qualifications()
        # if required_to_preview == True unqualified user even can't view the hit.
        # q.add( PercentAssignmentsApprovedRequirement( comparator="GreaterThan", integer_value="95" ) )
        q.add(AdultRequirement(comparator="EqualTo", integer_value="1"))

    def register_hit_type(self):
        """Register the demo HIT type and remember its id on success."""
        try:
            reg_hit_type = self.connect.register_hit_type(
                title="Nine Picture!",
                description=
                "Choose some best pictures which you think is the best from following pictures.",
                reward=0.01,
                duration=60 * 30,
                keywords="steak, photo",
                approval_delay=datetime.timedelta(days=1),
                qual_req=self.qualifications)
        except MTurkRequestError as e:
            print("register hit type error:\n status: %s reason: %s\n body: %s" % (
                e.status, e.reason, e.body))
        else:
            self.hit_type_id = reg_hit_type
            print("hit type id %s" % reg_hit_type)

    def question_form(self):
        """Create a HIT with one free-text question about a single image."""
        qc = QuestionContent()
        qc.append(
            Binary(
                'image', 'jpg',
                'http://www.miranchomeatmarket.com/images/T-%20bone%20steak.jpg',
                'steak'))

        q = Question(identifier="This is the first girl!",
                     content=qc,
                     answer_spec=AnswerSpecification(FreeTextAnswer()),
                     is_required=True,
                     display_name="This is display name")

        qf = QuestionForm()
        qf.append(q)

        if self.hit_type_id:
            try:
                create_hit_rs = self.connect.create_hit(
                    hit_type=self.hit_type_id,
                    question=qf,
                    lifetime=datetime.timedelta(days=14),
                    max_assignments=10,
                    annotation="This is a annotation")
            except MTurkRequestError as e:
                print("create hit type error:\n status: %s reason: %s\n body: %s" % (
                    e.status, e.reason, e.body))
            else:
                print("success!! key: %s" % create_hit_rs)

    def question_form_formatted_content(self):
        """Create a HIT showing an XHTML table with nine image checkboxes."""
        qc = QuestionContent()
        formatted_xhtml = """\
<table border="1">
  <tr>
    <td></td>
    <td align="center">1</td>
    <td align="center">2</td>
    <td align="center">3</td>
  </tr>
  <tr>
    <td align="right">A</td>
    <td align="center"><b>X</b></td>
    <td align="center"> </td>
    <td align="center"><b>O</b></td>
  </tr>
  <tr>
    <td align="right">B</td>
    <td align="center"> </td>
    <td align="center"><b>O</b></td>
    <td align="center"> </td>
  </tr>
  <tr>
    <td align="right">C</td>
    <td align="center"> </td>
    <td align="center"> </td>
    <td align="center"><b>X</b></td>
  </tr>
  <tr>
    <td align="center" colspan="4">It is <b>X</b>'s turn.</td>
  </tr>
</table>
"""
        qc.append(FormattedContent(formatted_xhtml))

        # img1..img5 are the five images; img6..img9 repeat images 2..5.
        offered = self._STEAK_IMAGES + self._STEAK_IMAGES[1:]
        selections = [
            (Binary('image', 'jpg', url, alt).get_as_xml(), 'img%d' % number)
            for number, (url, alt) in enumerate(offered, 1)
        ]

        q = Question(
            identifier="Formatted content test!",
            content=qc,
            answer_spec=AnswerSpecification(
                SelectionAnswer(
                    min=1,
                    max=5,
                    style='checkbox',
                    selections=selections,
                    type='binary')),
            is_required=True,
            display_name="This is display name")

        qf = QuestionForm()
        qf.append(q)

        if self.hit_type_id:
            try:
                create_hit_rs = self.connect.create_hit(
                    hit_type=self.hit_type_id,
                    question=qf,
                    lifetime=datetime.timedelta(days=14),
                    max_assignments=1,
                    annotation="This is a annotation")
            except MTurkRequestError as e:
                print("create hit type error:\n status: %s reason: %s\n body: %s" % (
                    e.status, e.reason, e.body))
            else:
                print("success!! key: %s" % create_hit_rs)

    def external_question(self):
        """Create a HIT whose question is an external web page."""
        q = ExternalQuestion(
            external_url=
            "http://www.kernel.org/pub/software/scm/git/docs/everyday.html",
            frame_height=200)

        # keywords = ['image', 'filter', 'google']
        # #create hit without id
        # create_hit_rs = self.connect.create_hit( question=q, lifetime=60 * 65, max_assignments=2, title="Google Image Filter", keywords=keywords, reward=0.05, duration=60 * 6, approval_delay=60 * 60, annotation='An annotation from boto external question test', response_groups=['Minimal', 'HITDetail', 'HITQuestion', 'HITAssignmentSummary', ], qualifications=self.qualifications )

        #create hit with id
        if self.hit_type_id:
            try:
                hit = self.connect.create_hit(
                    hit_type=self.hit_type_id,
                    question=q,
                    lifetime=datetime.timedelta(days=14),
                    max_assignments=1,
                    annotation="This is a annotation")
            except MTurkRequestError as e:
                print("register hit type error:\n status: %s reason: %s\n body: %s" % (
                    e.status, e.reason, e.body))
            else:
                print("hit id: %s " % hit[0].HITId)
                print("hit type id: %s " % hit[0].HITTypeId)

    def get_account_balance(self):
        print(self.connect.get_account_balance())

    def getHits(self):
        print(self.connect.get_all_hits())

    def getHit(self, hit_id):
        """Print every attribute of the HIT record as ``name: value``."""
        hit_rs = self.connect.get_hit(hit_id)
        hit = hit_rs[0]
        for k, v in hit.__dict__.items():
            print("%s: %s" % (k, v))

    def searchHits(self):
        print(self.connect.search_hits())

    def getAssignments(self, hit_id):
        print(self.connect.get_assignments(hit_id))

    def getReviewableHits(self):
        print(self.connect.get_reviewable_hits())
class MyMTurk:
    """Demo client exercising boto's MTurk API against the sandbox host.

    Call ``register_hit_type()`` first; the three HIT-creating methods do
    nothing until a HIT type id is available.
    """
    AWS_ACCESS_KEY_ID = 'skip'
    AWS_SECRET_ACCESS_KEY = 'skip'
    HOST_SANDBOX = 'mechanicalturk.sandbox.amazonaws.com'
    HOST_MTURK = 'mechanicalturk.amazonaws.com'
    EXTERNAL_URL = 'http://redbug0314.blogspot.com/p/imcrowd.html'

    # Set by register_hit_type() on success. Defaulting to None lets the
    # ``if self.hit_type_id`` guards skip HIT creation instead of raising
    # AttributeError when registration was not run or failed.
    hit_type_id = None

    # (image url, alt text) pairs offered by question_form_formatted_content.
    _STEAK_IMAGES = [
        ( 'http://images.google.com/images?q=tbn:ANd9GcSh1HXq3WyOvvG7-AgvNugKC2LzImMUvUDNTuDAPwVKuw8NZzvLN62pGYhX:farm1.static.flickr.com/21/24204504_e143536a2e.jpg', 'steak1' ),
        ( 'http://images.google.com/images?q=tbn:ANd9GcTkMoChevUBvQfmfksKDBM5oj4V2ruj6riqv7kC-_6qf9MR0igeBlJLkSI:www.miranchomeatmarket.com/images/T-%2520bone%2520steak.jpg', 'steak2' ),
        ( 'http://images.google.com/images?q=tbn:ANd9GcSttsqT7kj9siDKZg1p4fU6W9IFlMZHCFSxFd49ECJR1Bu_1QlHQwmH1DU:img4.myrecipes.com/i/recipes/ck/06/08/grilled-steak-ck-1215910-l.jpg', 'steak3' ),
        ( 'http://images.google.com/images?q=tbn:ANd9GcRfdQ-vuNt-W4W7JZRkAmbZpE6LLA0puCQs5erSzrGtsOY8H8t-vgEzqA:www.greendiamondgrille.com/images/new/NewYorkStripSteak.jpg', 'steak4' ),
        ( 'http://images.google.com/images?q=tbn:ANd9GcTsJzCp6En1R9yvFQw7bGsSxiiQCqlMrFg7XCbcJ13G39Aa3e6ZilWW34oI:www.bunrab.com/dailyfeed/dailyfeed_images_jan-07/df07_01-08_steak.jpg', 'steak5' ),
    ]

    def __init__( self ):
        #connect to MTurk
        self.connect = MTurkConnection( self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY, host=self.HOST_SANDBOX )

        #Qualification setting
        q = self.qualifications = Qualifications()
        # if required_to_preview == True unqualified user even can't view the hit.
        # q.add( PercentAssignmentsApprovedRequirement( comparator="GreaterThan", integer_value="95" ) )
        q.add( AdultRequirement( comparator="EqualTo", integer_value="1" ) )

    def register_hit_type( self ):
        """Register the demo HIT type and remember its id on success."""
        try:
            reg_hit_type = self.connect.register_hit_type(
                title="Nine Picture!",
                description="Choose some best pictures which you think is the best from following pictures.",
                reward=0.01,
                duration=60 * 30,
                keywords="steak, photo",
                approval_delay=datetime.timedelta( days=1 ),
                qual_req=self.qualifications )
        except MTurkRequestError as e:
            print( "register hit type error:\n status: %s reason: %s\n body: %s" % ( e.status, e.reason, e.body ) )
        else:
            self.hit_type_id = reg_hit_type
            print( "hit type id %s" % reg_hit_type )

    def question_form( self ):
        """Create a HIT with one free-text question about a single image."""
        qc = QuestionContent()
        qc.append( Binary( 'image', 'jpg', 'http://www.miranchomeatmarket.com/images/T-%20bone%20steak.jpg', 'steak' ) )

        q = Question( identifier="This is the first girl!",
                      content=qc,
                      answer_spec=AnswerSpecification( FreeTextAnswer() ),
                      is_required=True,
                      display_name="This is display name" )

        qf = QuestionForm()
        qf.append( q )

        if self.hit_type_id:
            try:
                create_hit_rs = self.connect.create_hit(
                    hit_type=self.hit_type_id,
                    question=qf,
                    lifetime=datetime.timedelta( days=14 ),
                    max_assignments=10,
                    annotation="This is a annotation" )
            except MTurkRequestError as e:
                print( "create hit type error:\n status: %s reason: %s\n body: %s" % ( e.status, e.reason, e.body ) )
            else:
                print( "success!! key: %s" % create_hit_rs )

    def question_form_formatted_content( self ):
        """Create a HIT showing an XHTML table with nine image checkboxes."""
        qc = QuestionContent()
        formatted_xhtml = """\
<table border="1">
  <tr>
    <td></td>
    <td align="center">1</td>
    <td align="center">2</td>
    <td align="center">3</td>
  </tr>
  <tr>
    <td align="right">A</td>
    <td align="center"><b>X</b></td>
    <td align="center"> </td>
    <td align="center"><b>O</b></td>
  </tr>
  <tr>
    <td align="right">B</td>
    <td align="center"> </td>
    <td align="center"><b>O</b></td>
    <td align="center"> </td>
  </tr>
  <tr>
    <td align="right">C</td>
    <td align="center"> </td>
    <td align="center"> </td>
    <td align="center"><b>X</b></td>
  </tr>
  <tr>
    <td align="center" colspan="4">It is <b>X</b>'s turn.</td>
  </tr>
</table>
"""
        qc.append( FormattedContent( formatted_xhtml ) )

        # img1..img5 are the five images; img6..img9 repeat images 2..5.
        offered = self._STEAK_IMAGES + self._STEAK_IMAGES[1:]
        selections = [
            ( Binary( 'image', 'jpg', url, alt ).get_as_xml(), 'img%d' % number )
            for number, ( url, alt ) in enumerate( offered, 1 )
        ]

        q = Question( identifier="Formatted content test!",
                      content=qc,
                      answer_spec=AnswerSpecification(
                          SelectionAnswer( min=1,
                                           max=5,
                                           style='checkbox',
                                           selections=selections,
                                           type='binary' ) ),
                      is_required=True,
                      display_name="This is display name" )

        qf = QuestionForm()
        qf.append( q )

        if self.hit_type_id:
            try:
                create_hit_rs = self.connect.create_hit(
                    hit_type=self.hit_type_id,
                    question=qf,
                    lifetime=datetime.timedelta( days=14 ),
                    max_assignments=1,
                    annotation="This is a annotation" )
            except MTurkRequestError as e:
                print( "create hit type error:\n status: %s reason: %s\n body: %s" % ( e.status, e.reason, e.body ) )
            else:
                print( "success!! key: %s" % create_hit_rs )

    def external_question( self ):
        """Create a HIT whose question is an external web page."""
        q = ExternalQuestion( external_url="http://www.kernel.org/pub/software/scm/git/docs/everyday.html", frame_height=200 )

        # keywords = ['image', 'filter', 'google']
        # #create hit without id
        # create_hit_rs = self.connect.create_hit( question=q, lifetime=60 * 65, max_assignments=2, title="Google Image Filter", keywords=keywords, reward=0.05, duration=60 * 6, approval_delay=60 * 60, annotation='An annotation from boto external question test', response_groups=['Minimal', 'HITDetail', 'HITQuestion', 'HITAssignmentSummary', ], qualifications=self.qualifications )

        #create hit with id
        if self.hit_type_id:
            try:
                hit = self.connect.create_hit(
                    hit_type=self.hit_type_id,
                    question=q,
                    lifetime=datetime.timedelta( days=14 ),
                    max_assignments=1,
                    annotation="This is a annotation" )
            except MTurkRequestError as e:
                print( "register hit type error:\n status: %s reason: %s\n body: %s" % ( e.status, e.reason, e.body ) )
            else:
                print( "hit id: %s " % hit[0].HITId )
                print( "hit type id: %s " % hit[0].HITTypeId )

    def get_account_balance( self ):
        print( self.connect.get_account_balance() )

    def getHits( self ):
        print( self.connect.get_all_hits() )

    def getHit( self, hit_id ):
        """Print every attribute of the HIT record as ``name: value``."""
        hit_rs = self.connect.get_hit( hit_id )
        hit = hit_rs[0]
        for k, v in hit.__dict__.items():
            print( "%s: %s" % ( k, v ) )

    def searchHits( self ):
        print( self.connect.search_hits() )

    def getAssignments( self, hit_id ):
        print( self.connect.get_assignments( hit_id ) )

    def getReviewableHits( self ):
        print( self.connect.get_reviewable_hits() )
class HitCreator():
    """Create, process and clean up Mechanical Turk HITs for audio snippets.

    Each audio snippet alternates between two HIT types: a "check" HIT, in
    which a worker marks every word of the latest prediction as correct or
    incorrect, and a "fix" HIT, in which a worker retypes the words that were
    marked incorrect.
    """

    def __init__(self):
        """Connect to the MTurk sandbox or production host, per settings."""
        if settings.IS_DEV_ENV or settings.USE_AMT_SANDBOX:
            HOST = 'mechanicalturk.sandbox.amazonaws.com'
        else:
            HOST = 'mechanicalturk.amazonaws.com'

        self.connection = MTurkConnection(
            aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
            aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
            host=HOST)

    def createHitFrom(self, audioSnippet, hitType, numIncorrectWords=None):
        """Create one HIT for `audioSnippet` and remember it as the active HIT.

        :param audioSnippet: snippet the HIT is about; its `pk` is passed to
            the external question URL and its `activeHITId` is updated/saved.
        :param hitType: either "fix" or "check".
        :param numIncorrectWords: required (int) for "fix" HITs; used to
            price the HIT.
        """
        if hitType == "fix":
            suffix = "fixHIT"
            assert isinstance(numIncorrectWords, int)
            # Half a cent per incorrect word, clamped to [$0.02, $0.05].
            amount = max(min(.05, numIncorrectWords*.005), .02)
        elif hitType == "check":
            suffix = "checkHIT"
            amount = 0.05
        else:
            assert False

        if settings.IS_DEV_ENV:
            baseurl = 'https://localhost:5000/hit/' + suffix
        else:
            baseurl = "https://transcroobie.herokuapp.com/hit/" + suffix

        title = "Transcribe a short audio clip."
        description = "Transcribe the audio. Words may be cut off at the beginning"\
                      " or end of the segment. Do not worry about correctly"\
                      " transcribing these words."
        keywords = ["transcription"]
        frame_height = 800
        thisDocUrl = baseurl + "?docId=" + str(audioSnippet.pk)
        questionform = ExternalQuestion(thisDocUrl, frame_height)

        resultSet = self.connection.create_hit(
            title=title,
            description=description,
            keywords=keywords,
            max_assignments=1,
            question=questionform,
            reward=Price(amount=amount),
            # Controls how much HIT detail the API echoes back.
            response_groups=('Minimal', 'HITDetail'),
        )
        assert len(resultSet) == 1
        audioSnippet.activeHITId = resultSet[0].HITId
        audioSnippet.save()

    def deleteHit(self, hitID):
        """Disable a single HIT; a request error is reported, not raised."""
        try:
            self.connection.disable_hit(hitID)
        except MTurkRequestError as e:
            print("HIT already deleted %s" % (e,))

    def deleteAllHits(self):
        """Disable every HIT returned by the connection."""
        # Materialise first so that disabling does not interleave with paging.
        allHits = list(self.connection.get_all_hits())
        for hit in allHits:
            print("Disabling hit  %s" % (hit.HITId,))
            self.deleteHit(hit.HITId)

    def processHit(self, questionFormAnswers):
        """Apply one submitted assignment's answers to its audio snippet.

        Process each HIT only once. This function sets activeHITId to "" to
        signal that the HIT is completed and processed, and creates the
        follow-up HIT (FIX after CHECK, CHECK after FIX) when the snippet is
        still incomplete.

        :param questionFormAnswers: iterable of answers exposing `qid` and
            `fields`.
        """
        hitType = None
        response = None
        audioSnippet = None
        fixWords = {}
        for questionFormAnswer in questionFormAnswers:
            if questionFormAnswer.qid == "asFileId":
                asFileId = questionFormAnswer.fields[0]
                audioSnippet = get_object_or_404(AudioSnippet, pk=asFileId)
            elif questionFormAnswer.qid == "fixedHITResult":
                hitType = "fix"
                response = None  # need to look at word_%d based on audiosnippet
            elif questionFormAnswer.qid.startswith("word_"):
                fixWords[questionFormAnswer.qid] = questionFormAnswer.fields[0]
            elif questionFormAnswer.qid == "checkedHITResult":
                hitType = "check"
                responseStr = questionFormAnswer.fields[0]
                response = [val == 'true' for val in responseStr.split(',')]

        numIncorrectWords = 0
        if hitType == "fix":
            # The latest CHECK result tells us which words were marked wrong.
            incorrectWords = audioSnippet.incorrectWords['bools'][-1]
            # Get the last prediction to interpret incorrectWords
            prediction = audioSnippet.predictions[-1].split()
            # Convert the last prediction to what was actually sent to
            # the user
            predictionSpaced = transcriptWithSpacesAndEllipses(prediction)
            assert len(incorrectWords) == len(predictionSpaced)
            words, isCorrect = combineConsecutiveDuplicates(predictionSpaced,
                                                            incorrectWords)
            response = ""
            lastIndex = len(words) - 1
            for i, word in enumerate(words):
                if not isCorrect[i]:
                    response += fixWords["word_" + str(i)] + " "
                    continue
                # Only add punctuation (" ") and ellipses if marked incorrect
                if word.isspace() or word == "":
                    continue
                elif i == 0 and word.startswith("..."):
                    word = word[3:]  # remove initial ellipses
                elif i == lastIndex and word.endswith("..."):
                    word = word[:-3]  # remove trailing ellipses
                response += word.strip() + " "
            audioSnippet.predictions.append(response)
            # Always do a check after a fix
            completionStatus = CompletionStatus.incomplete
        else:
            audioSnippet.incorrectWords['bools'].append(response)
            # Count the words this CHECK marked incorrect (False entries) so
            # the follow-up FIX HIT is priced per word. Previously the count
            # stayed 0 here and every FIX HIT was paid the minimum reward.
            numIncorrectWords = len(response) - sum(response)
            completionStatus = self.getCompletionStatus(audioSnippet, response)

        if completionStatus == CompletionStatus.correct:
            audioSnippet.hasBeenValidated = True
            audioSnippet.isComplete = True
        elif completionStatus == CompletionStatus.givenup:
            audioSnippet.hasBeenValidated = False
            audioSnippet.isComplete = True

        audioSnippet.activeHITId = ""
        if completionStatus == CompletionStatus.incomplete:
            if hitType == "check":
                # CHECK task complete. Create a FIX task (since not
                # hasBeenValidated)
                self.createHitFrom(audioSnippet, 'fix', numIncorrectWords)
            elif hitType == "fix":
                # FIX task complete. Create a CHECK task.
                self.createHitFrom(audioSnippet, 'check')
        audioSnippet.save()

    def getCompletionStatus(self, audioSnippet, response):
        """Return the CompletionStatus implied by a CHECK response.

        Only call this for "check" HITs. The snippet is `correct` when every
        word was marked correct, `givenup` once it has accumulated more than
        MAX_NUM_PREDICTIONS predictions, and `incomplete` otherwise.
        """
        MAX_NUM_PREDICTIONS = 2
        completionStatus = CompletionStatus.incomplete
        if all(response):
            completionStatus = CompletionStatus.correct
        elif len(audioSnippet.predictions) > MAX_NUM_PREDICTIONS:
            completionStatus = CompletionStatus.givenup
        return completionStatus

    def processHits(self, doc):
        """ Returns whether or not the doc had a newly-completed HIT
            which was processed.

            When every snippet of the doc is validated or complete, the
            snippets' latest predictions are combined and stored as the
            doc's complete transcript.
        """
        assert not doc.completeTranscript
        audioSnippets = doc.audioSnippets.order_by('id')

        newHITCompleted = False
        for audioSnippet in audioSnippets:
            hitID = audioSnippet.activeHITId
            if not hitID:
                continue

            try:
                hit = self.connection.get_hit(hitID)
            except MTurkRequestError as e:
                logger.error("Perhaps this HIT no longer exists: " + str(e))
                continue

            asgnForHit = self.connection.get_assignments(hit[0].HITId)
            if asgnForHit:
                # Hit is ready. Get the data.
                for asgn in asgnForHit:
                    questionFormAnswers = asgn.answers[0]
                    self.processHit(questionFormAnswers)
                    newHITCompleted = True

        # Every snippet must be inspected here. The previous code read a
        # leaked comprehension variable and therefore only looked at the
        # last snippet's hasBeenValidated flag.
        if all([a.hasBeenValidated for a in audioSnippets]) or \
                all([a.isComplete for a in audioSnippets]):
            # Note: if the conditional is not met, predictions may be an empty
            # array. Don't run this next line outside of this conditional.
            # (Happens only in a race condition after the audioSnippet is
            # uploaded, and before it adds its first prediction.)
            responses = [a.predictions[-1] for a in audioSnippets]

            # All tasks complete for first time
            totalString = overlap.combineSeveral(responses)
            doc.completeTranscript = totalString
            doc.save()

        return newHITCompleted

    def isTaskReady(self, hitID):
        """Return True when the HIT has at least one assignment."""
        return len(self.connection.get_assignments(hitID)) > 0

    def approveAllHits(self):
        """Approve every assignment of every HIT, logging request errors."""
        for assignment in self.getAllAssignments():
            try:
                self.connection.approve_assignment(assignment.AssignmentId)
            except MTurkRequestError as e:
                # Maybe already approved?
                logger.error("MTurk Request Error: " + str(e))

    def checkIfHitsReady(self):
        """Placeholder that always reports readiness."""
        return True

    def getAllAssignments(self):
        """Yield the assignments of every HIT, one HIT at a time."""
        allHits = list(self.connection.get_all_hits())
        for hit in allHits:
            assignments = self.connection.get_assignments(hit.HITId)
            for assignment in assignments:
                yield assignment
# print('Assignments: {}'.format(assignments)) # To get any information about status, you have to get the HIT via get_all_hits # If you just use get_hit() it gets minimal info # currhits = {} # for h in mtc.get_all_hits(): # if h.HITId in hitids: # print(h.HITId) # currhits[h.HITId] = h # print('{}: {}'.format(len(currhits), currhits)) # # get_all_hits iterates through all your current HITs, grabbing 100 at a time # # best to break as soon as you get all the HITIds in your group # if len(currhits) == len(hitids): # break currhits = {h.HITId: h for h in mtc.get_all_hits() if h.HITId in hitids} print('{} Current HITs: {}'.format(len(currhits), sorted(currhits.keys()))) process_assignments(assignments, all_results, currhits) outkeys.extend(list(sorted(answer_keys))) # Structure of hits # foo.Amount foo.Expiration foo.IntegerValue foo.QualificationTypeId # foo.AssignmentDurationInSeconds foo.FormattedPrice foo.Keywords foo.RequesterAnnotation # foo.AutoApprovalDelayInSeconds foo.HIT foo.LocaleValue foo.RequiredToPreview # foo.Comparator foo.HITGroupId foo.MaxAssignments foo.Reward # foo.Country foo.HITId foo.NumberOfAssignmentsAvailable foo.Title # foo.CreationTime foo.HITReviewStatus foo.NumberOfAssignmentsCompleted # foo.CurrencyCode foo.HITStatus foo.NumberOfAssignmentsPending foo.expired # foo.Description foo.HITTypeId foo.QualificationRequirement
class MTurk(object):
    """
    A class that wraps a boto.mturk.connection object and provides methods for
    the most common AI2 use cases
    """

    def __init__(self, aws_access_key_id, aws_secret_access_key,
                 host=SANDBOX_HOST):
        """
        initializes the instance with AWS credentials and a host
        :param aws_access_key_id the access key id.
        :param aws_secret_access_key the secret access key.
        :param host the mturk host to connect to
        """
        self.connection = MTurkConnection(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            host=host)
        self.host = host

    def __del__(self):
        """
        close the connection whenever this object goes out of scope
        """
        # __init__ may have raised before `connection` was assigned; in that
        # case there is nothing to close.
        connection = getattr(self, "connection", None)
        if connection is not None:
            connection.close()

    def get_account_balance(self):
        """
        :return the balance on the mturk account
        """
        return self.connection.get_account_balance()[0]

    def _create_hit(self, params, **kwargs):
        """
        internal helper function for creating a HIT
        :param params the parameters (required and optional) common to all HITs
        :param **kwargs any other parameters needed for a specific HIT type
        :return the created HIT object
        """
        return self.connection.create_hit(
            title=params["title"],
            description=params["description"],
            keywords=params["keywords"],
            max_assignments=params["max_assignments"],
            reward=Price(amount=params["amount"]),
            qualifications=params["qualifications"],
            lifetime=params["lifetime"],
            # optional params below
            annotation=params.get("annotation"),
            **kwargs)

    def create_url_hit(self, params):
        """
        creates a HIT for an external question with a specified URL
        :param params a dict of the HIT parameters. must contain "url" and
               "frame_height" parameters
        :return the created HIT object
        """
        question = ExternalQuestion(params["url"], params["frame_height"])
        return self._create_hit(params, question=question)

    def create_html_hit(self, params):
        """
        creates a HIT for a question with the specified HTML
        :param params a dict of the HIT parameters, must contain "html" and
               "frame_height" parameters
        :return the created HIT object
        """
        question = HTMLQuestion(params["html"], params["frame_height"])
        return self._create_hit(params, question=question)

    def create_layout_hit(self, params):
        """
        creates a HIT for a question using the supplied layout id
        :param params a dict of the HIT parameters, must contain a "hit_layout"
               parameters with the layout id, and a "layout_params" parameter
               that's the dict of parameters to feed to the layout (an
               iterable of (name, value) pairs is accepted as well).
        :return the created HIT object
        """
        raw_layout_params = params["layout_params"]
        # Iterating a dict yields only its keys, so unpack items() when a
        # mapping is supplied; plain sequences of pairs are used as they are.
        if hasattr(raw_layout_params, "items"):
            name_value_pairs = raw_layout_params.items()
        else:
            name_value_pairs = raw_layout_params

        # create the LayoutParameters object from the supplied params
        layout_params = LayoutParameters([
            LayoutParameter(name, value) for name, value in name_value_pairs
        ])
        return self._create_hit(params,
                                hit_layout=params["hit_layout"],
                                layout_params=layout_params)

    def delete_all_hits(self):
        """
        Permanently disables/ deletes all of the user's active HITs.
        :return: None
        """
        my_hits = list(self.get_all_hits())
        for hit in my_hits:
            self.connection.disable_hit(hit.HITId)

    def get_assignments_object_list(self, assignment_dict):
        """
        Returns a list of "<boto.mturk.connection.Assignment object at...>"
        objects
        :param assignment_dict a dictionary of HITId-assignment object pairs
        :return a flat list with the assignments of every entry, in order
        """
        assignments = []
        for entry in assignment_dict:
            assignments.extend(assignment_dict[entry])
        return assignments

    def get_results_dict(self, HIT_assignments):
        """
        Takes a list of HIT assignment objects as input.
        Returns a list of dictionaries, one per assignment, containing:
            assignment_object: the assignment itself
            worker_Id: the worker ID of the Turker who completed the HIT
            HIT_id: the HIT ID
            answers: a dictionary of qid-answer field value pairs
        """
        assignment_results = []
        for assignment in HIT_assignments:
            answers_dict = {}
            for answer in assignment.answers[0]:
                answers_dict[answer.qid] = answer.fields
            assignment_results.append({
                "assignment_object": assignment,
                "worker_Id": assignment.WorkerId,
                "HIT_id": assignment.HITId,
                "answers": answers_dict,
            })
        return assignment_results

    def get_all_results(self, hits):
        """
        Collects the result dictionaries for every supplied HIT
        :param hits the HITs to fetch assignments and results for
        :return dict from HITId to the list built by get_results_dict
        """
        all_results = {}
        for hid, assignments in self.get_assignments(hits).items():
            all_results[hid] = self.get_results_dict(assignments)
        return all_results

    def get_reviewable_hits(self, annotations=None, detailed=False):
        """
        Get all the reviewable HITs. By default returns minimal HIT objects,
        but will return detailed ones (by necessity) if annotations is
        specified or if detailed is True
        :param annotations an optional set of annotations to retrieve HITs for
        :param detailed do you want detailed HIT objects or minimal ones
        :return a list of HIT objects
        """
        minimal_hits = []
        page_num = 1
        # Page through the results until an empty page comes back.
        while True:
            more_hits = self.connection.get_reviewable_hits(
                page_size=100, page_number=page_num)
            if not more_hits:
                break
            minimal_hits.extend(more_hits)
            page_num += 1

        if detailed or annotations is not None:
            # NOTE(review): boto's get_hit appears to return a result set
            # rather than a bare HIT -- confirm annotation_filter expects that.
            detailed_hits = [
                self.connection.get_hit(hit.HITId,
                                        response_groups=('Minimal',
                                                         'HITDetail'))
                for hit in minimal_hits
            ]
            return [
                hit for hit in detailed_hits
                if annotation_filter(annotations, hit)
            ]
        else:
            return minimal_hits

    def get_all_hits(self, annotations=None):
        """
        Get all the HITs.
        :param annotations a set of annotations to get HITs for, all HITs if
               not specified
        :return a list of HIT objects
        """
        return [
            hit for hit in self.connection.get_all_hits()
            if annotation_filter(annotations, hit)
        ]

    def get_assignments(self, hits=None, hit_ids=None, status=None):
        """
        Retrieves individual assignments associated with the supplied HITs
        :param hits the HITs to get assignments for (used when hit_ids is
               not given)
        :param hit_ids the HITIds to get assignments for
        :param status assignment status to filter by
        :return dict from HITId to lists of assignments
        """
        if hit_ids is None:
            hit_ids = [hit.HITId for hit in hits]

        # NOTE(review): a single get_assignments call is made per HIT, so
        # only the first result page is returned -- confirm that suffices.
        return {
            hit_id: self.connection.get_assignments(hit_id, status=status)
            for hit_id in hit_ids
        }

    def disable_hit(self, hit=None, hit_id=None):
        """
        disable the specified hit (or the hit with the specified id). must
        specify either `hit` or `hit_id`
        :param hit a HIT object to disable
        :param hit_id a HITId to disable
        """
        hit_id = hit.HITId if hit is not None else hit_id
        return self.connection.disable_hit(hit_id)

    def approve_assignment(self, assignment=None, assignment_id=None,
                           feedback=None):
        """
        approve the specified assignment (or the assigment with the specified
        id) must specify either `assignment` or `assignment_id`
        :param assignment an assignment object to approve
        :param assignment_id an AssignmentId to approve
        :param feedback optional feedback for the worker
        """
        assignment_id = assignment.AssignmentId if assignment is not None else assignment_id
        return self.connection.approve_assignment(assignment_id, feedback)

    def reject_assignment(self, assignment=None, assignment_id=None,
                          feedback=None):
        """
        reject the specified assignment (or the assigment with the specified
        id) must specify either `assignment` or `assignment_id`
        :param assignment an assignment object to reject
        :param assignment_id an AssignmentId to reject
        :param feedback optional feedback for the worker
        """
        assignment_id = assignment.AssignmentId if assignment is not None else assignment_id
        return self.connection.reject_assignment(assignment_id, feedback)
class TranscriptionPipelineHandler():
    """Interactive pipeline that moves audio clips through MTurk HITs.

    Audio sources are clipped and given reference transcriptions, queued
    into HITs, and the submitted assignments are loaded and then approved
    or rejected. State is kept through the Mongo handler (`self.mh`).
    """

    def __init__(self):
        """Connect to MTurk using credentials from the environment and
        build the helper handlers."""
        aws_id = os.environ['AWS_ACCESS_KEY_ID']
        aws_k = os.environ['AWS_ACCESS_KEY']

        self.conn = MTurkConnection(aws_access_key_id=aws_id,
                                    aws_secret_access_key=aws_k,
                                    host=HOST)

        self.ah = AssignmentHandler(self.conn)
        self.th = TurkerHandler(self.conn)
        self.hh = HitHandler(self.conn, TEMPLATE_DIR)
        self.mh = MongoTranscriptionHandler()
        self.wh = WavHandler()
        self.ph = PromptHandler()
        self.filter = Filter(self.mh)
        # Cached once; decremented locally whenever a HIT is created.
        self.balance = self.conn.get_account_balance()[0].amount
        self.logger = logging.getLogger(
            "transcription_engine.transcription_pipeline_handler")

    def audio_clip_referenced_to_hit(self, priority=1, max_queue_size=10):
        """Queue every "Referenced" audio clip and try to create a HIT
        after each one is queued."""
        for audio_clip in self.mh.get_artifacts_by_state(
                "audio_clips", "Referenced"):
            audio_clip_id = audio_clip["_id"]
            self.mh.queue_clip(audio_clip_id, priority, max_queue_size)
            self.audio_clip_queue_to_hit()

    def audio_clip_queued_to_hit(self, priority=1, max_queue_size=10):
        """Try to create a HIT once for every clip already in state
        "Queued". `priority` and `max_queue_size` are accepted but unused."""
        for audio_clip in self.mh.get_artifacts("audio_clips",
                                                {"state": "Queued"}):
            self.audio_clip_queue_to_hit()
            #===================================================================
            # elif state == "Hit":
            #     print("In hit: %s"%audio_clip_url)
            #===================================================================

    def audio_clip_queue_to_hit(self, cost_sensitive=True):
        """Take queued audio clips from the audio clip queue
            put them in a hit and create the hit.
            If successful, update the audio clip state.

            Returns the result of the clip state update when a HIT was
            created, otherwise False."""
        clip_queue = self.mh.get_audio_clip_queue()
        clip_pairs = self.mh.get_audio_clip_pairs(clip_queue)
        if clip_pairs:
            hit_title = "Audio Transcription"
            question_title = "List and Transcribe"
            description = "Transcribe the audio clip by typing the words the person says in order."
            keywords = "audio, transcription, audio transcription"
            # NOTE(review): HITs are only created on the cost-sensitive path;
            # confirm that cost_sensitive=False is meant to be a no-op.
            if cost_sensitive:
                reward_per_clip = 0.02
                max_assignments = 3
                estimated_cost = self.hh.estimate_html_HIT_cost(
                    clip_pairs, reward_per_clip, max_assignments)
                clips_in_hits = self.mh.clips_already_in_hit(clip_pairs)
                if clips_in_hits:
                    #If one or more clips are already in a HIT, remove it from the queue
                    self.mh.remove_audio_clips_from_queue(clips_in_hits)
                elif self.balance - estimated_cost >= 250:
                    #if we have enough money, create the HIT
                    response = self.hh.make_html_transcription_HIT(
                        clip_pairs, hit_title, question_title, description,
                        keywords)
                    self.balance = self.balance - estimated_cost
                    if isinstance(response, ResultSet) and len(
                            response) == 1 and response[0].IsValid:
                        response = response[0]
                        self.mh.remove_audio_clips_from_queue(clip_queue)
                        audio_clip_ids = [
                            w["audio_clip_id"] for w in clip_queue
                        ]
                        hit_id = response.HITId
                        hit_type_id = response.HITTypeId
                        self.mh.create_transcription_hit_artifact(
                            hit_id, hit_type_id, clip_queue, "New")
                        self.logger.info("Successfully created HIT: %s" %
                                         hit_id)
                        return self.mh.update_audio_clips_state(
                            audio_clip_ids, "Hit")
        return False

    def load_assignments_hit_to_submitted(self):
        """Check all assignments for audio clip IDs.
            Update the audio clips.
            This is a non-destructive load of the assignments from MTurk"""
        hits = self.conn.get_all_hits()
        for hit in hits:
            hit_id = hit.HITId
            assignments = self.conn.get_assignments(hit_id)
            have_all_assignments = True
            assignment_ids = []
            for assignment in assignments:
                assignment_ids.append(assignment.AssignmentId)
                if self.mh.get_artifact("assignments",
                                        {"_id": assignment.AssignmentId}):
                    #We create assignments here, so if we already have it, skip
                    continue
                have_all_assignments = False
                transcription_ids = []
                transcription_dicts = self.ah.get_assignment_submitted_transcriptions(
                    assignment)
                for transcription in transcription_dicts:
                    if not self.mh.get_artifact_by_id(
                            "audio_clips", transcription["audio_clip_id"]):
                        self.logger.info("Assignment(%s) with unknown audio clip(%s) skipped"%\
                                    (assignment.AssignmentId,transcription["audio_clip_id"]))
                        break
                    self.mh.update_transcription_state(transcription,
                                                       "Submitted")
                    self.mh.update_audio_clips_state(
                        [transcription["audio_clip_id"]], "Submitted")
                    transcription_ids.append(
                        self.mh.get_artifact(
                            "transcriptions", {
                                "audio_clip_id":
                                transcription["audio_clip_id"],
                                "assignment_id":
                                transcription["assignment_id"]
                            }, "_id"))
                else:
                    # Only reached when no transcription referred to an
                    # unknown audio clip (the loop did not break).
                    self.mh.create_assignment_artifact(assignment,
                                                       transcription_ids,
                                                       "Submitted")
            if assignments and not have_all_assignments:
                self.mh.update_transcription_hit_state(hit_id, "Submitted")
                print("Transcriptions HIT(%s) submitted assignments: %s " %
                      (hit_id, assignment_ids))

    def assignment_submitted_approved(self):
        """For all submitted assignments,
            if an answered question has a reference transcription,
            check the WER.
            If all the answered questions with reference transcriptions
            have an acceptable WER, approve the assignment and update
            the audio clips and transcriptions."""
        assignments = self.mh.get_artifacts_by_state("assignments",
                                                     "Submitted")
        rejected_feedback = ("I'm sorry but your work in assignment(%s) was rejected because"
                             " one or more of your transcriptions "
                             " had a word error rate above the maximum acceptable"
                             " word error rate of %s. Omitted words and words that "
                             " differed by more than %s "
                             " characters were counted as an error.")
        accepted_feedback = ("Your average word error rate on assignment(%s) was %s."
                             " Assignment accepted! Thanks for your hard work.")
        for assignment in assignments:
            assignment_id = assignment["_id"]
            transcription_ids = assignment["transcriptions"]
            transcriptions = self.mh.get_artifacts("transcriptions", "_id",
                                                   transcription_ids)

            worker_id = assignment["worker_id"]
            worker_id = self.mh.create_worker_artifact(worker_id)

            approved, average_wer = self.filter.approve_assignment(
                transcriptions)
            if approved:
                try:
                    self.conn.approve_assignment(
                        assignment_id,
                        accepted_feedback % (assignment_id, average_wer))
                except MTurkRequestError as e:
                    print(e)
                else:
                    self.mh.update_assignment_state(assignment, "Approved")
                    for transcription in transcriptions:
                        #Approve transcriptions without references in the same assignment
                        reference_id = self.mh.get_artifact_by_id(
                            "audio_clips", transcription["audio_clip_id"],
                            "reference_transcription_id")
                        if not reference_id:
                            self.mh.update_transcription_state(
                                transcription, "Approved")
                    print("Approved transcription ids: %s" %
                          transcription_ids)
            else:
                # The assignment is rejected on MTurk and recorded as Denied.
                feedback = rejected_feedback % (assignment_id,
                                                self.filter.WER_THRESHOLD,
                                                self.filter.CER_THRESHOLD)
                self.logger.info(feedback)
                self.conn.reject_assignment(assignment_id, feedback)
                self.mh.update_assignment_state(assignment, "Denied")
            #print("Assignments not aproved %s "%denied)
            #Update the worker
            if approved:
                self.mh.add_assignment_to_worker(worker_id,
                                                 (assignment_id, average_wer))

    def _load_rm_audio_source_file_to_clipped(
            self,
            file_dir,
            prompt_file_uri,
            base_clip_dir,
            sample_rate=16000,
            http_base_url="http://www.cis.upenn.edu/~tturpen/wavs/",
            init_clip_count=200):
        """For an audio directory,
            see which files are new and not an audio source already

            Walks `file_dir`, converts .sph files to .wav when needed and,
            for each resulting wav, creates the audio source, audio clip and
            reference transcription artifacts. Stops after `init_clip_count`
            clips. Raises PromptNotFound when a wav has no prompt.
            """
        sph_ext = ".sph"
        wav_ext = ".wav"
        prompt_dict = self.ph.get_prompts(prompt_file_uri)
        count = 0
        for root, dirs, files in os.walk(file_dir):
            for f in files:
                if count == init_clip_count:
                    return
                system_uri = os.path.join(root, f)
                # Swap a trailing ".sph" for ".wav". str.strip(".sph") removes
                # any of the characters '.', 's', 'p', 'h' from both ends and
                # would mangle names such as "ships.sph".
                if f.endswith(sph_ext):
                    stem = f[:-len(sph_ext)]
                else:
                    stem = f
                out_uri = os.path.join(root, stem + wav_ext)
                spkr_id = str(os.path.relpath(root, file_dir))
                #sph to wav
                if not f.endswith(wav_ext) and not os.path.exists(out_uri):
                    try:
                        self.wh.sph_to_wav(system_uri, out_uri=out_uri)
                    except WavHandlerException as e:
                        self.logger.error("Unable to create wav from sph: " +
                                          str(e))

                if os.path.exists(out_uri) and out_uri.endswith(wav_ext):
                    #create audio source artifact
                    count += 1
                    wav_filename = os.path.basename(out_uri)
                    # Drop the ".wav" suffix only (not a character set).
                    prompt_id = wav_filename[:-len(wav_ext)].upper()
                    encoding = ".wav"
                    disk_space = os.stat(out_uri).st_size
                    length_seconds = self.wh.get_audio_length(out_uri)
                    if prompt_id in prompt_dict:
                        transcription_prompt = prompt_dict[prompt_id]
                    else:
                        #No prompt found
                        raise PromptNotFound
                    # The caller-supplied sample_rate is honoured (it used to
                    # be overwritten with 16000, which is still the default).
                    source_id = self.mh.create_audio_source_artifact(
                        out_uri, disk_space, length_seconds, sample_rate,
                        spkr_id, encoding)
                    #create audio clip artifact
                    audio_clip_uri = os.path.join(base_clip_dir, spkr_id,
                                                  wav_filename)
                    clip_dir = os.path.dirname(audio_clip_uri)
                    if not os.path.exists(clip_dir):
                        os.makedirs(clip_dir)
                    if not os.path.exists(audio_clip_uri):
                        copyfile(out_uri, audio_clip_uri)
                    #http_url
                    http_url = os.path.join(http_base_url, spkr_id,
                                            wav_filename)
                    clip_id = self.mh.create_audio_clip_artifact(
                        source_id, 0, -1, audio_clip_uri, http_url,
                        length_seconds, disk_space)

                    #Update the audio source, updates state too
                    self.mh.update_audio_source_audio_clip(source_id, clip_id)

                    #Create the reference transcription artifact
                    transcription_id = self.mh.create_reference_transcription_artifact(
                        clip_id, transcription_prompt, "Gold")
                    #Completes audio clip to Referenced
                    self.mh.update_audio_clip_reference_transcription(
                        clip_id, transcription_id)

    def all_workers_liveness(self):
        """For each worker, interactively show the reference/hypothesis
        pairs of their approved and/or denied assignments."""
        workers = self.mh.get_all_workers()
        for worker in workers:
            approved, denied = self.mh.get_worker_assignments(worker)
            print("Worker(%s) assignments, approved(%s) denied(%s)" %
                  (worker["_id"], approved, denied))
            # raw_input (as used elsewhere in this class) avoids evaluating
            # whatever the operator types, which Python 2's input() does.
            selection = str(raw_input(
                "1. Show denied transcriptions and references.\n" +
                "2. Show accepted transcriptions and references.\n" +
                "3. Show both denied and accepted transcriptions.")).strip()
            # Options now match the menu: 1 = denied, 2 = accepted, 3 = both.
            if selection == "2" or selection == "3":
                print("Approved transcriptions")
                for assignment_id in approved:
                    transcription_pairs = self.mh.get_transcription_pairs(
                        assignment_id)
                    for pair in transcription_pairs:
                        print("Reference:\n\t%s\nHypothesis:\n\t%s\n" %
                              (pair[0], pair[1]))
            if selection == "1" or selection == "3":
                print("Denied transcriptions")
                for assignment_id in denied:
                    transcription_pairs = self.mh.get_transcription_pairs(
                        assignment_id)
                    for pair in transcription_pairs:
                        print("Reference:\n\t%s\nHypothesis:\n\t%s\n" %
                              (pair[0], pair[1]))

    def stats(self):
        """Print each worker's approved-assignment count and average WER,
        followed by the average WER over all approved assignments."""
        workers = self.mh.get_all_workers()
        all_wer_per_approved_assignment = 0.0
        total_accepted = 0.0
        for worker in workers:
            worker_wer = 0.0
            approved, denied = self.mh.get_worker_assignments_wer(worker)
            for w in approved:
                all_wer_per_approved_assignment += float(w[1])
                worker_wer += float(w[1])
                total_accepted += 1
            if approved:
                worker_average_wer = worker_wer / len(approved)
                print("%s,%s" % (len(approved), worker_average_wer))
            #print("Worker(%s) approved assignments(%s)\n denied assignments(%s)"%(worker_id,approved,denied))
        if not total_accepted:
            # Nothing approved yet: avoid dividing by zero.
            print("No approved assignments to average.")
            return
        av = all_wer_per_approved_assignment / total_accepted
        print("Average WER per assignment(%s)" % (av))

    def get_assignment_stats(self):
        """Print the effective hourly wage for a reward of 0.20."""
        self.effective_hourly_wage_for_approved_assignments(.20)

    def effective_hourly_wage_for_approved_assignments(self,
                                                       reward_per_assignment):
        """Calculate the effective hourly wage for Approved Assignments

        Assignments without a "SubmitTime" are left out of the average."""
        time_format = "%Y-%m-%dT%H:%M:%SZ"
        approved_assignments = self.mh.get_artifacts_by_state(
            "assignments", "Approved")
        total = datetime.timedelta(0)
        count = 0
        for assignment in approved_assignments:
            if "SubmitTime" not in assignment:
                # Skip it; previously the timestamps of the preceding
                # assignment were counted again (or a NameError was raised
                # when the first assignment had no SubmitTime).
                continue
            accepted = datetime.datetime.strptime(assignment["AcceptTime"],
                                                  time_format)
            submitted = datetime.datetime.strptime(assignment["SubmitTime"],
                                                   time_format)
            total += submitted - accepted
            count += 1
        if count == 0:
            print("No approved assignments with a SubmitTime.")
            return
        seconds_per_assignment = total.total_seconds() / count
        if seconds_per_assignment <= 0:
            print("Approved assignments have no measurable completion time.")
            return
        effective_hourly_wage = 60.0 * 60.0 / seconds_per_assignment * reward_per_assignment
        print("Effective completion time(%s) *reward(%s) = %s" %
              (seconds_per_assignment, reward_per_assignment,
               effective_hourly_wage))

    def allhits_liveness(self):
        """Walk every HIT and interactively offer to disable it. HITs
        without assignments are also removed locally and their clips are
        reset to "Referenced". MTurkRequestError propagates to the caller."""
        #allassignments = self.conn.get_assignments(hit_id)
        #first = self.ah.get_submitted_transcriptions(hit_id,str(clipid))

        hits = self.conn.get_all_hits()
        for hit in hits:
            hit_id = hit.HITId
            print("HIT ID: %s" % hit_id)
            assignments = self.conn.get_assignments(hit_id)
            if len(assignments) == 0:
                if raw_input("Remove hit with no submitted assignments?(y/n)"
                             ) == "y":
                    self.conn.disable_hit(hit_id)
                    clips = self.mh.get_artifact("transcription_hits",
                                                 {"_id": hit_id}, "clips")
                    self.mh.remove_transcription_hit(hit_id)
                    self.mh.update_audio_clips_state(clips, "Referenced")
            else:
                if raw_input("Remove hit with %s submitted assignments?(y/n)"
                             % len(assignments)) == "y":
                    self.conn.disable_hit(hit_id)

    def run(self):
        """Show the interactive pipeline menu until "11" (Exit) is chosen."""
        audio_file_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/ind_trn"
        #audio_file_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/dep_trn"
        prompt_file_uri = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/doc/al_sents.snr"
        base_clip_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/clips"
        selection = 0
        init_clip_count = 10000
        while selection != "11":
            selection = raw_input(
                """Audio Source file to Audio Clip Approved Pipeline:\n
                 1: AudioSource-FileToClipped: Initialize Resource Management audio source files to %d queueable(Referenced) clips
                 2: AudioClip-ReferencedToHit: Queue all referenced audio clips and create a HIT if the queue is full.
                 3: AudioClip-HitToSubmitted: Check all submitted assignments for Transcriptions.
                 4: AudioClip-SubmittedToApproved: Check all submitted clips against their reference.
                 5: Review Current Hits
                 6: Worker liveness
                 7: Account balance
                 8: Worker stats
                 9: Recalculate worker WER
                 10: Assignment Stats
                 11: Exit
                """ % init_clip_count)
            #selection = "5"
            if selection == "1":
                self._load_rm_audio_source_file_to_clipped(
                    audio_file_dir,
                    prompt_file_uri,
                    base_clip_dir,
                    init_clip_count=init_clip_count)
            elif selection == "2":
                self.audio_clip_referenced_to_hit()
            elif selection == "3":
                self.load_assignments_hit_to_submitted()
            elif selection == "4":
                self.assignment_submitted_approved()
            elif selection == "5":
                self.allhits_liveness()
            elif selection == "6":
                self.all_workers_liveness()
            elif selection == "7":
                print("Account balance: %s" % self.balance)
            elif selection == "8":
                self.stats()
            elif selection == "9":
                # The method is only present as commented-out code below, so
                # calling it unconditionally raised AttributeError.
                recalculate = getattr(self,
                                      "recalculate_worker_assignment_wer",
                                      None)
                if recalculate is None:
                    print("Recalculating worker WER is currently disabled.")
                else:
                    recalculate()
            elif selection == "10":
                self.get_assignment_stats()

#    def get_time_submitted_for_assignments(self):
#        assignments = self.mh.get_all_artifacts("assignments")
#        for assignment in assignments:
#            assignment_id = assignment["_id"]
#            a_assignment = self.conn.get_assignment(assignment_id)[0]
#            self.mh.update_artifact_by_id("assignments", assignment_id, "SubmitTime", a_assignment.SubmitTime)

#    def recalculate_worker_assignment_wer(self):
#        """For all submitted assignments,
#            if an answered question has a reference transcription,
#            check the WER.
#            If all the answered questions with reference transcriptions
#            have an acceptable WER, approve the assignment and update
#            the audio clips and transcriptions."""
#        assignments = self.mh.get_artifacts("assignments",{"state":"Approved"})
#        for assignment in assignments:
#            assignment_id = assignment["_id"]
#            denied = []
#            #If no transcriptions have references then we automatically approve the HIT
#            approved = True
#            transcription_ids = assignment["transcriptions"]
#            transcriptions = self.mh.get_transcriptions("_id",transcription_ids)
#            worker_id = assignment["worker_id"]
#            worker_id = self.mh.create_worker_artifact(worker_id)
#
#            max_rej_wer = (0.0,0.0)
#            total_wer = 0.0
#            for transcription in transcriptions:
#                #Normalize the transcription
#                #self.mh.normalize_transcription
#                reference_id = self.mh.get_audio_clip_by_id(transcription["audio_clip_id"],"reference_transcription_id")
#                if reference_id:
#                    reference_transcription = self.mh.get_reference_transcription({"_id": reference_id},
#                                                                                  "transcription")
#                    new_transcription = transcription["transcription"].split(" ")
#                    if reference_transcription:
#                        transcription_wer = cer_wer(reference_transcription,new_transcription)
#                        total_wer += transcription_wer
#                        if transcription_wer < WER_THRESHOLD:
#                            self.logger.info("WER for transcription(%s) %d"%(transcription["transcription"],transcription_wer))
#                        else:
#                            max_rej_wer = (transcription_wer,WER_THRESHOLD)
#                            denied.append((reference_transcription,new_transcription))
#                            approved = False
#            average_wer = total_wer/len(transcriptions)
#            #Update the worker
#            self.mh.add_assignment_to_worker(worker_id,(assignment_id,average_wer))
def deleteAllHits():
    """Disable every HIT that the connection on `_host` reports."""
    # Original author's note: this function should probably take an input
    # parameter of a pickle file with the hits to be disposed...
    connection = MTurkConnection(host=_host)
    # Lazily pull the ids so HITs are disabled while the listing is consumed.
    hit_ids = (listed_hit.HITId for listed_hit in connection.get_all_hits())
    for hit_id in hit_ids:
        connection.disable_hit(hit_id)
title = document.title url = document.url print url, title, text tuple_list.append([url, title, ' '.join(text)]) print len(tuple_list) print 'Retrieved', len(tuple_list) print 'Missing: ', len(missing) print missing print 'Sending it to mechanicalturk' mtc = MTurkConnection(aws_access_key_id=ACCESS_ID, aws_secret_access_key=SECRET_KEY, host=HOST, is_secure=True, https_connection_factory=(https_connection_factory, ())) print mtc.get_all_hits() fact = MTurkSurveyFactory() questionForms = fact.buildSurvey(tuple_list) print len(questionForms) missing_forms = [] for questionForm in questionForms: try: fact.submitHITs(mtc=mtc, questionForms=[questionForm]) except: missing_forms.extend(questionForm) print "Unexpected error:", sys.exc_info()[0] print len(missing_forms), ' forms could not be submitted' print missing_forms
question_form.append(overview) #question_form.append(q1) question_form.append(q2) #--------------- CREATE THE HIT ------------------- HIT = mtc.create_hit(questions=question_form, max_assignments=1, title=title, description=description, keywords=keywords, duration = 60*5, reward=0.05)dvz for hit in HIT: print hit.HITId print(mtc.get_all_hits()) print ("https://workersandbox.mturk.com/mturk/preview?groupId="+hit.HITTypeId); #resp = requests.get('https://rest-stage.sqor.com/feeds/aggregate/sqor?limit=1000&offset=0',verify=False) #print (resp.text) #if resp.status_code != 200: # # This means something went wrong. # raise ApiError('GET /tasks/ {}'.format(resp.status_code)) #for todo_item in resp.json(): # print('{}'.format(todo_item['id'])) #s=FileUploadURL(mtc); #print(s);
import numpy as np
import pandas as pd
import ast
from boto.mturk.connection import MTurkRequestError
from boto.mturk.connection import MTurkConnection
from secret import SECRET_KEY, ACCESS_KEY, AMAZON_HOST

# Start Configuration Variables
AWS_ACCESS_KEY_ID = ACCESS_KEY
AWS_SECRET_ACCESS_KEY = SECRET_KEY

# Single connection shared by every request below.
connection = MTurkConnection(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    host=AMAZON_HOST,
)

# Materialise both listings up front.
all_hits = list(connection.get_all_hits())
all_reviewable_hits = list(connection.get_reviewable_hits())

# Print each HIT's assignment set, then every assignment with its worker id.
for hit in all_hits:
    assignments = connection.get_assignments(hit.HITId)
    print(assignments)
    for assignment in assignments:
        print(assignment)
        print("Answers of the worker %s" % assignment.WorkerId)
print 'Using API file "%s"' % api_path else: print 'ERROR: API file argument "%s" does not exist' % api_path exit(1) # Set up MTURK connection api_secrets = read_json(api_path) mtk = MTurkConnection( aws_access_key_id = api_secrets['mt_aws_key'], aws_secret_access_key = api_secrets['mt_aws_secret_key'], host = api_secrets['mt_host'] ) # Get our HIT (the latest with its name) target_hit_title = api_secrets['mt_hit_title'] matching_hits = [hit for hit in mtk.get_all_hits() if hit.Title == target_hit_title and hit.HITStatus != 'Disposed'] hit = matching_hits[-1] if len(matching_hits) > 0 else None if hit != None: print 'Found HIT "%s" (ID# %s)' % (hit.Title, hit.HITId) else: print 'ERROR: No matching HIT "%s" found' % target_hit_title exit(1) # Find all approved assignments hit_id = hit.HITId approved = [asgn for asgn in mtk.get_assignments(hit_id, page_size=100) if asgn.AssignmentStatus == 'Approved'] # Load an existing schedule, if it exists schedule = read_json(schedule_path) if os.path.isfile(schedule_path) else {} # Issue out reminders 12 hours after submission, then every 24 after that, until the end of the observation period
def handle(self, *args, **options):
    """Write one CSV row per Mechanical Turk HIT summarising worker answers.

    ``args`` must contain exactly one item: the path of the CSV file to write.
    ``options['filter']`` may hold comma-separated ``attr=value`` pairs; a HIT
    is skipped when any named attribute differs from the given value.
    ``options['sandbox']`` selects the sandbox endpoint instead of production.

    Raises:
        CommandError: if the output path is missing or a filter has no ``=``.
    """
    # check args
    if len(args) != 1:
        raise CommandError("Please specify one argument.")

    # set up filters if there are any
    filters = {}
    if options['filter']:
        for filter_item in options['filter'].split(','):
            # Split on the first '=' only so that values may contain '='.
            name, sep, value = filter_item.strip().partition('=')
            if not sep:
                raise CommandError(
                    "Filter '%s' is not of the form name=value." % filter_item)
            filters[name] = value

    # create a connection
    if options['sandbox']:
        host = 'mechanicalturk.sandbox.amazonaws.com'
    else:
        host = 'mechanicalturk.amazonaws.com'
    mturk = MTurkConnection(
        getattr(settings, 'MTURK_AWS_KEY', settings.MEDIASYNC['AWS_KEY']),
        getattr(settings, 'MTURK_AWS_SECRET', settings.MEDIASYNC['AWS_SECRET']),
        host=host
    )

    results = []
    workers = set()

    for hit in mturk.get_all_hits():
        # check filters
        if any(getattr(hit, param, None) != wanted
               for param, wanted in filters.items()):
            print('Skipping hit %s for failure to match filters' % hit.HITId)
            continue

        row = {'td_id': hit.RequesterAnnotation, 'hit_id': hit.HITId}

        assignments = mturk.get_assignments(hit.HITId)
        answer_dict = {}
        for a in assignments:
            try:
                answer_dict['worker_%s' % a.WorkerId] = dict(a.answers[0][0].fields)['is_match']
                workers.add(a.WorkerId)
            except Exception:
                # Best effort: a malformed answer must not abort the export,
                # but Ctrl-C / SystemExit are no longer swallowed.
                print('Something weird happened with hit %s and worker %s' % (hit.HITId, a.WorkerId))

        row.update(answer_dict)
        row['num_assignments'] = len(answer_dict)
        row['disagreement'] = len(answer_dict) > 1 and len(set(answer_dict.values())) > 1

        non_yes = [answer for answer in answer_dict.values() if answer != 'yes']
        row['any_non_yes'] = len(non_yes) > 0
        row['majority_non_yes'] = len(non_yes) >= row['num_assignments'] / 2.0

        if row['num_assignments'] < 3 or row['num_assignments'] != len(assignments):
            print('Hit %s had %s successful assignments of %s attempted.'
                  % (row['hit_id'], row['num_assignments'], len(assignments)))

        results.append(row)

    # Sorted worker ids give a stable column order between runs.
    fields = ['td_id', 'hit_id', 'num_assignments', 'disagreement',
              'any_non_yes', 'majority_non_yes'] + \
             ['worker_%s' % worker_id for worker_id in sorted(workers)]

    # The context manager guarantees the CSV is flushed and closed.
    with open(args[0], 'wb') as writer_file:
        writer = csv.DictWriter(writer_file, fields, restval='', extrasaction='ignore')
        writer.writeheader()
        for row in results:
            writer.writerow(row)
class HaCRSTurker:
    """Glue between the HaCRS database and a Mechanical Turk connection.

    The connection parameters are read from the ``mturk`` section of
    ``../config.ini``; created HITs are recorded through ``HaCRSDB``.
    """

    def __init__(self):
        self.config = HaCRSUtil.get_config('../config.ini')
        HOST = self.config.get('mturk', 'host')
        AWS_ACCESS_KEY_ID = self.config.get('mturk', 'access_key_id')
        AWS_SECRET_ACCESS_KEY = self.config.get('mturk', 'secret_access_key')

        self.MTconnection = MTurkConnection(
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            host=HOST)
        self.db = HaCRSDB()

    def get_balance(self):
        """Placeholder; the balance lookup is currently disabled."""
        #print self.MTconnection.get_account_balance()
        pass

    def expire_all_hits(self):
        """Expire every HIT that is not already expired (best effort)."""
        for hit in self.MTconnection.get_all_hits():
            if hit.expired:
                continue
            try:
                self.MTconnection.expire_hit(hit.HITId)
            except Exception:
                # Deliberately best effort: one failing HIT must not stop
                # the remaining HITs from being expired.
                pass

    def delete_all_mturk_hits(self):
        """Expire and then dispose of every HIT; errors propagate."""
        for hit in self.MTconnection.get_all_hits():
            self.MTconnection.expire_hit(hit.HITId)
            self.MTconnection.dispose_hit(hit.HITId)

    def get_all_mturk_hits(self):
        """Return whatever ``get_all_hits`` yields for this connection."""
        return self.MTconnection.get_all_hits()

    # TODO: HITs available via API, but not via Amazon Web Sandbox
    def push_tasklet_mturk(self, keywords):
        """Create one external-question HIT for the given difficulty keyword.

        ``keywords`` selects both the target URL and the reward
        (``easy`` 1.00, ``medium``/``hard``/``very_hard`` 2.00,
        ``priority`` 4.00).  Any other value terminates the process with
        exit status 1.

        Returns:
            ``(mturkid, hit_result)``: the database id of the recorded HIT
            and the raw ``create_hit`` result.
        """
        frame_height = self.config.get('mturk', 'frameheight')
        #url = "https://cgcturk.hacked.jp/tasklet/{}/".format(tasklet['id'])
        url = "https://cgcturk.hacked.jp/pick_tasklet/{}/".format(keywords)
        #keywords = tasklet['keywords']

        #amount = tasklet['amount']
        if keywords == 'easy':
            amount = 1.00
        elif keywords in ['medium', 'hard', 'very_hard']:
            amount = 2.00
        elif keywords == 'priority':
            amount = 4.00
        else:
            #print 'Error'
            sys.exit(1)

        questionform = ExternalQuestion(url, frame_height)

        title = 'HELP AN AI!!! We are students building an artificial intelligence to find bugs in programs to keep the internet safe'
        # The configured 'shortdescr' was read and immediately overwritten
        # here, so only the literal description is kept.
        sdescription = 'We are students building an artificial intelligence system that finds bugs in programs and keeps the internet safe from malware. BUT IT NEEDS YOUR HELP! Play with programs to find functions that it missed, and get $$$!'
        hit_result = self.MTconnection.create_hit(
            title='[{}] {}'.format(keywords, title),
            description=sdescription,
            keywords=keywords,
            max_assignments=1,
            question=questionform,
            reward=Price(amount=amount),
            response_groups=('Minimal', 'HITDetail'),  # ?
        )
        assert len(hit_result) == 1
        mturkid = self.db.create_mturk_resource(hit_result[0].HITId,
                                                hit_result[0].HITGroupId)
        #self.db.add_mturk_tasklet_association(tasklet['id'], mturkid)
        #self.db.commit()
        return mturkid, hit_result

    def push_tasks_mturk(self):
        """Create a 0.01-reward HIT for every unassigned tasklet and link it in the DB."""
        frame_height = self.config.get('mturk', 'frameheight')
        amount = 0.01
        tasklets = self.db.get_unassigned_tasklets()
        sdescription = self.config.get('mturk', 'shortdescr')

        for tasklet in tasklets:
            #print 'pushing!'
            url = "https://cgcturk.hacked.jp/tasklet/{}/".format(tasklet['id'])
            keywords = ["easy"]
            questionform = ExternalQuestion(url, frame_height)

            hit_result = self.MTconnection.create_hit(
                title=HaCRSUtil.get_tasklet_name(tasklet),
                description=sdescription,
                keywords=keywords,
                max_assignments=1,
                question=questionform,
                reward=Price(amount=amount),
                response_groups=('Minimal', 'HITDetail'),  # ?
            )
            assert len(hit_result) == 1
            mturkid = self.db.create_mturk_resource(hit_result[0].HITId,
                                                    hit_result[0].HITGroupId)
            self.db.add_mturk_tasklet_association(tasklet['id'], mturkid)
            self.db.commit()

    def show_seed_tasklets(self):
        """Pretty-print the seed tasklets stored in the database."""
        pprint(self.db.get_seed_tasklets())

    def get_hit(self, hitid):
        """Return the first HIT for ``hitid``, or ``None`` if lookup fails or is empty."""
        try:
            hit = self.MTconnection.get_hit(hitid)
        except Exception:
            return None
        # An empty result used to raise IndexError; treat it like "not found".
        if hit:
            return hit[0]
        return None

    def get_assignment_from_hit(self, hitid):
        """Return the first assignment of ``hitid``, or ``None`` if there is none."""
        try:
            assignments = self.MTconnection.get_assignments(hitid)
            return assignments[0]
        except Exception:
            return None

    def get_approved_seeding_tasklets(self):
        """Return the programs whose latest seed tasklet has an approved first assignment."""
        # Close the programs file deterministically instead of leaking the handle.
        with open(self.config.get('general', 'programsjson')) as programs_file:
            programs = json.load(programs_file)
        for program in programs:
            # Result unused; the call is kept in case the lookup has side
            # effects in the database layer.
            self.db.lookup_program(program)

        approved = set()
        for tasklet in self.db.get_latest_seed_tasklets():
            turkinfos = self.db.get_mturk_infos(tasklet['id'])
            try:
                #hit = self.MTconnection.get_hit(turkinfos['hitid'])
                assignments = self.MTconnection.get_assignments(
                    turkinfos['hitid'])
                if len(assignments) == 0:
                    continue
                if assignments[0].AssignmentStatus == 'Approved':
                    approved.add(self.db.get_tasklet_program(tasklet['id']))
            except Exception:
                # Best effort: tasklets that cannot be resolved are skipped.
                pass
        return list(approved)
host = "mechanicalturk.amazonaws.com" res_dir = res_dir + "production/" # Open log file if log == "y": log_con = open(res_dir+start_time+".log", "w") # Open MTurk connection if access != "" and secret != "": mturk = MTurkConnection(host = host, aws_access_key_id=access, aws_secret_access_key=secret) else: mturk = MTurkConnection(host = host) # Download reviewable HITs reviewable_assignments = list() hits = mturk.get_all_hits() hit_list = map(lambda h: h, hits) for h in hit_list: max_page = False m = 10 p = 1 while not max_page: assignments = mturk.get_assignments(h.HITId, page_size=m, page_number=p) if len(assignments) > 0: reviewable_assignments.extend(assignments) if(len(assignments) < m): max_page = True else: p += 1 else: max_page =True
# Single required drop-down question ('design') whose choices come from `height`.
fta = SelectionAnswer(min=1, max=1, style='dropdown',
                      selections=height,
                      type='text',
                      other=False)

q = Question(identifier='design',
             content=qc,
             answer_spec=AnswerSpecification(fta),
             is_required=True)

question_form = QuestionForm()
question_form.append(overview)
question_form.append(q)

#--------------- CREATE THE HIT -------------------

HIT = mtc.create_hit(questions=question_form,
                     max_assignments=1,
                     title=title,
                     description=description,
                     keywords=keywords,
                     duration=60 * 5,
                     reward=0.05)

for hit in HIT:
    print(hit.HITId)
print(mtc.get_all_hits())
# Iterate again instead of relying on the loop variable leaking out of the
# loop above: that raised NameError whenever create_hit returned no HIT.
for hit in HIT:
    print("https://workersandbox.mturk.com/mturk/preview?groupId=" + hit.HITTypeId)
class MTurk(object):
    """Application extension wrapping a boto ``MTurkConnection``.

    Credentials and the sandbox flag come from ``app.config``
    (``MTURK_ACCESS_ID``, ``MTURK_SECRET_KEY``, ``MTURK_SANDBOX``).  Every
    public operation reconnects first.  On failure the methods return either
    ``False`` or ``dict(success=False, message=...)``; which of the two is
    part of each method's existing contract and is preserved here.
    """

    def __init__(self, app=None):
        self.host = 'https://mechanicalturk.sandbox.amazonaws.com'
        self.secret_key = None
        self.access_id = None
        # Defaults so an instance built without an app fails gracefully in
        # connect_to_turk() instead of raising AttributeError.
        self.aws_access_key_id = None
        self.aws_secret_access_key = None
        self.is_sandbox = True
        self.valid_login = False
        self.mtc = None
        self.app = app
        if app is not None:
            self.init_app(app)

    def init_app(self, app):
        """Read configuration from ``app`` and verify the AWS credentials."""
        app.config.setdefault('MTURK_SECRET_KEY', None)
        app.config.setdefault('MTURK_ACCESS_ID', None)
        app.config.setdefault('MTURK_SANDBOX', True)
        self.update_credentials(app.config['MTURK_ACCESS_ID'],
                                app.config['MTURK_SECRET_KEY'])
        self.is_sandbox = app.config['MTURK_SANDBOX']
        self.valid_login = self.verify_aws_login()

    def update_credentials(self, aws_access_key_id, aws_secret_access_key):
        """Store the AWS key pair used for subsequent connections."""
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key

    def verify_aws_login(self):
        """Check the credentials by requesting the account balance.

        Returns:
            ``True`` on success, ``None`` when no keys are configured, or
            ``dict(success=False, message=...)`` when the request fails.
        """
        if ((self.aws_secret_access_key is None) or
                (self.aws_access_key_id is None)):
            logging.warning('No AWS keys found in app configuration')
            return None
        host = 'mechanicalturk.amazonaws.com'
        params = dict(aws_access_key_id=self.aws_access_key_id,
                      aws_secret_access_key=self.aws_secret_access_key,
                      host=host)
        self.mtc = MTurkConnection(**params)
        try:
            self.mtc.get_account_balance()
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)
        return True

    def connect_to_turk(self):
        """Open a connection to the sandbox or production host.

        Returns ``False`` (after logging a warning) unless the last login
        verification succeeded, ``True`` otherwise.
        """
        # verify_aws_login() reports failure as a (truthy) dict, so a plain
        # truthiness test would treat a failed login as valid.
        if self.valid_login is not True:
            logging.warning(
                'Sorry, unable to connect to Amazon Mechanical Turk. Please check your credentials'
            )
            return False
        if self.is_sandbox:
            host = 'mechanicalturk.sandbox.amazonaws.com'
        else:
            host = 'mechanicalturk.amazonaws.com'

        mturkparams = dict(aws_access_key_id=self.aws_access_key_id,
                           aws_secret_access_key=self.aws_secret_access_key,
                           host=host)
        self.mtc = MTurkConnection(**mturkparams)
        return True

    @staticmethod
    def _hits_to_data(hits):
        """Convert boto HIT objects into ``MTurkHIT`` records."""
        return [
            MTurkHIT({
                'hitid': hit.HITId,
                'title': hit.Title,
                'status': hit.HITStatus,
                'max_assignments': hit.MaxAssignments,
                'number_assignments_completed': hit.NumberOfAssignmentsCompleted,
                'number_assignments_pending': hit.NumberOfAssignmentsPending,
                'number_assignments_available': hit.NumberOfAssignmentsAvailable,
                'creation_time': hit.CreationTime,
                'expiration': hit.Expiration,
            }) for hit in hits
        ]

    def get_account_balance(self):
        """Return the account balance, or an error dict on failure."""
        if not self.connect_to_turk():
            return dict(success=False, message='Could not connect to AWS')
        try:
            balance = self.mtc.get_account_balance()
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)
        return balance

    def get_reviewable_hits(self):
        """Return HITs whose status is ``Reviewable`` or ``Reviewing``."""
        if not self.connect_to_turk():
            return dict(success=False, message='Could not connect to AWS')
        try:
            hits = self.mtc.get_all_hits()
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)
        reviewable_hits = [
            hit for hit in hits
            if hit.HITStatus in ("Reviewable", "Reviewing")
        ]
        return self._hits_to_data(reviewable_hits)

    def get_all_hits(self):
        """ Get all HITs """
        if not self.connect_to_turk():
            return False
        try:
            hits = self.mtc.get_all_hits()
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)
        return self._hits_to_data(hits)

    def get_active_hits(self):
        """ Get active HITs """
        if not self.connect_to_turk():
            return dict(success=False, message='Could not connect to AWS')
        # hits = self.mtc.search_hits()
        try:
            hits = self.mtc.get_all_hits()
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)
        active_hits = [hit for hit in hits if not hit.expired]
        return self._hits_to_data(active_hits)

    def get_hit(self, hit_id, response_groups=None):
        """Return the HIT with ``hit_id``; ``False`` if the request fails."""
        if not self.connect_to_turk():
            return dict(success=False, message='Could not connect to AWS')
        try:
            hit = self.mtc.get_hit(hit_id, response_groups)[0]
        except MTurkRequestError:
            return False
        return hit

    def get_workers(self, assignment_status=None):
        """ Get workers """
        if not self.connect_to_turk():
            return dict(success=False, message='Could not connect to AWS')
        try:
            hits = self.mtc.search_hits(sort_direction='Descending',
                                        page_size=20)
        except MTurkRequestError:
            return False
        hit_ids = [hit.HITId for hit in hits]
        workers_nested = [
            self.mtc.get_assignments(hit_id,
                                     status=assignment_status,
                                     sort_by='SubmitTime',
                                     page_size=100) for hit_id in hit_ids
        ]
        workers = [val for subl in workers_nested
                   for val in subl]  # Flatten nested lists
        worker_data = [{
            'hitId': worker.HITId,
            'assignmentId': worker.AssignmentId,
            'workerId': worker.WorkerId,
            'submit_time': worker.SubmitTime,
            'accept_time': worker.AcceptTime,
            'status': worker.AssignmentStatus,
            'completion_code': worker.answers[0][0].fields[0]
        } for worker in workers]
        return worker_data

    def bonus_worker(self, assignment_id, amount, reason=""):
        """ Bonus worker """
        if not self.connect_to_turk():
            return dict(success=False, message='Could not connect to AWS')
        try:
            bonus = MTurkConnection.get_price_as_price(amount)
            assignment = self.mtc.get_assignment(assignment_id)[0]
            worker_id = assignment.WorkerId
            self.mtc.grant_bonus(worker_id, assignment_id, bonus, reason)
            return True
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)

    def approve_worker(self, assignment_id, feedback=None):
        """ Approve worker """
        if not self.connect_to_turk():
            return dict(success=False, message='Could not connect to AWS')
        try:
            self.mtc.approve_assignment(assignment_id, feedback=feedback)
            return True
        except MTurkRequestError:
            return False

    def reject_worker(self, assignment_id):
        """ Reject worker """
        if not self.connect_to_turk():
            return dict(success=False, message='Could not connect to AWS')
        try:
            self.mtc.reject_assignment(assignment_id, feedback=None)
            return True
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)

    def unreject_worker(self, assignment_id):
        """ Unreject worker """
        if not self.connect_to_turk():
            return dict(success=False, message='Could not connect to AWS')
        try:
            self.mtc.approve_rejected_assignment(assignment_id)
            return True
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)

    def assign_qualification(self, qualification_type_id, worker_id,
                             value=1, send_notification=True):
        """Assign a qualification to a worker; ``True`` on success."""
        if not self.connect_to_turk():
            return dict(success=False, message='Could not connect to AWS')
        try:
            self.mtc.assign_qualification(qualification_type_id, worker_id,
                                          value, send_notification)
            return True
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)

    def revoke_qualification(self, subject_id, qualification_type_id,
                             reason=None):
        """Revoke a qualification from a worker; ``True`` on success."""
        if not self.connect_to_turk():
            return False
        try:
            self.mtc.revoke_qualification(subject_id, qualification_type_id,
                                          reason)
            return True
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)

    def notify_worker(self, worker_id, subject, message_text):
        """Send a message to a worker; ``True`` on success."""
        if not self.connect_to_turk():
            return False
        try:
            self.mtc.notify_workers(worker_id, subject, message_text)
            return True
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)

    def list_workers_with_qualification(self, qualification_type_id):
        """Return the ``SubjectId`` of every worker holding the qualification."""
        if not self.connect_to_turk():
            return False
        try:
            workers = self.mtc.get_all_qualifications_for_qual_type(
                qualification_type_id)
        except MTurkRequestError as e:
            return dict(success=False, message=e.error_message)
        return [w.SubjectId for w in workers]
class ElicitationPipelineHandler(object):
    """Interactive pipeline from raw prompt sources to approved MTurk recordings.

    The handler owns one MTurk connection plus the Mongo-backed artifact
    store (``self.mh``) and exposes each pipeline stage as a method;
    ``run()`` drives them from a text menu.
    """

    def __init__(self):
        aws_id = os.environ['AWS_ACCESS_KEY_ID']
        aws_k = os.environ['AWS_ACCESS_KEY']

        try:
            self.conn = MTurkConnection(aws_access_key_id=aws_id,
                                        aws_secret_access_key=aws_k,
                                        host=HOST)
        except Exception as e:
            print(e)
            # Without a connection the next line would fail with an
            # unrelated AttributeError; surface the real error instead.
            raise

        self.ah = AssignmentHandler(self.conn)
        self.th = TurkerHandler(self.conn)
        self.hh = HitHandler(self.conn, TEMPLATE_DIR)
        self.mh = MongoElicitationHandler()
        self.ph = PromptHandler()
        self.filter = Filter(self.mh)
        self.balance = self.conn.get_account_balance()[0].amount
        self.batch_cost = 1
        # Spend at most batch_cost per run; refuse to start without funds.
        if self.balance > self.batch_cost:
            self.balance = self.batch_cost
        else:
            raise IOError("Account balance (%s) does not exceed the batch cost (%s)"
                          % (self.balance, self.batch_cost))
        self.logger = logging.getLogger(
            "transcription_engine.elicitation_pipeline_handler")

    def _known_hit_assignments(self):
        """Yield ``(hit_id, assignments)`` for each MTurk HIT stored as an elicitation HIT."""
        for hit in self.conn.get_all_hits():
            hit_id = hit.HITId
            if self.mh.get_artifact("elicitation_hits", {"_id": hit_id}):
                yield hit_id, self.conn.get_assignments(hit_id)

    def _disable_hit_and_release_prompts(self, hit_id):
        """Disable a HIT on MTurk, drop its artifact and update its prompts' state."""
        self.conn.disable_hit(hit_id)
        prompts = self.mh.get_artifact("elicitation_hits",
                                       {"_id": hit_id}, "prompts")
        self.mh.remove_elicitation_hit(hit_id)
        if prompts:
            self.mh.update_artifacts_state("prompts", prompts)

    def load_PromptSource_RawToList(self, prompt_file_uri):
        """Create the prompt artifacts from the source."""
        prompt_dict = self.ph.get_prompts(prompt_file_uri)
        disk_space = os.stat(prompt_file_uri).st_size
        source_id = self.mh.create_prompt_source_artifact(
            prompt_file_uri, disk_space, len(prompt_dict))
        normalizer = Normalize()
        for key in prompt_dict:
            prompt, line_number = prompt_dict[key]
            normalized_prompt = normalizer.rm_prompt_normalization(prompt)
            self.mh.create_prompt_artifact(source_id, prompt,
                                           normalized_prompt, line_number,
                                           key, len(prompt))

    def load_assignment_hit_to_submitted(self):
        """Check all assignments for audio clip IDs and update the audio clips.

        This is a non-destructive load of the assignments from MTurk:
        assignments that already have an artifact are skipped.
        """
        # NOTE(review): block nesting was reconstructed from whitespace-collapsed
        # source; confirm the for/else placement against the original file.
        prompt_id_tag = "prompt_id"
        recording_url_tag = "recording_url"
        worker_id_tag = "worker_id"
        for hit_id, assignments in self._known_hit_assignments():
            assignment_ids = []
            for assignment in assignments:
                assignment_id = assignment.AssignmentId
                assignment_ids.append(assignment_id)
                if self.mh.get_artifact("elicitation_assignments",
                                        {"_id": assignment.AssignmentId}):
                    # We create assignments here, so if we already have it, skip
                    continue
                recording_ids = []
                recording_dict = self.ah.get_assignment_submitted_text_dict(
                    assignment, prompt_id_tag, recording_url_tag)
                worker_oid = self.mh.create_worker_artifact(
                    assignment.WorkerId)
                zipcode = None
                for recording in recording_dict:
                    if recording[prompt_id_tag] == "zipcode":
                        zipcode = recording[recording_url_tag]
                        continue
                    if not self.mh.get_artifact_by_id(
                            "prompts", recording[prompt_id_tag]):
                        self.logger.info("Assignment(%s) with unknown %s(%s) skipped" %
                                         (assignment_id, prompt_id_tag, recording[prompt_id_tag]))
                        break
                    recording_id = self.mh.create_recording_source_artifact(
                        recording[prompt_id_tag],
                        recording[recording_url_tag],
                        recording[worker_id_tag])
                    if not recording_id:
                        self.mh.create_assignment_artifact(assignment,
                                                           recording_ids,
                                                           zipcode=zipcode,
                                                           incomplete=True)
                        break
                    self.mh.add_item_to_artifact_set(
                        "prompts", recording[prompt_id_tag],
                        "recording_sources", recording_id)
                    recording_ids.append(recording_id)
                else:
                    # Reached only when no recording aborted the loop.
                    self.mh.create_assignment_artifact(assignment,
                                                       recording_ids,
                                                       zipcode=zipcode)
                    self.mh.add_item_to_artifact_set(
                        "elicitation_hits", hit_id,
                        "submitted_assignments", assignment_id)
                    self.mh.add_item_to_artifact_set(
                        "workers", worker_oid,
                        "submitted_assignments", assignment_id)
            print("Elicitation HIT(%s) submitted assignments: %s " %
                  (hit_id, assignment_ids))

    def approve_assignment_submitted_to_approved(self):
        """Approve all submitted assignments"""
        for hit_id, assignments in self._known_hit_assignments():
            for assignment in assignments:
                assignment_id = assignment.AssignmentId
                if self.mh.get_artifact("elicitation_assignments", {
                        "_id": assignment_id,
                        "state": "Submitted"
                }):
                    # WARNING: this Approves every assignment
                    self.conn.approve_assignment(
                        assignment_id,
                        "Thank you for completing this assignment!")
                    self.mh.update_artifact_by_id(
                        "elicitation_assignments", assignment_id,
                        "approval_time", datetime.datetime.now())

    def approve_assignment_by_worker(self):
        """Review submitted assignments interactively, per assignment and per worker.

        Assignments of workers already marked ``Approved``/``Rejected`` are
        handled automatically; for the rest the recordings are played with
        ``gnome-mplayer`` and the operator is asked what to do.
        """
        approval_comment = "Thank you for your recordings, good work, assignment approved!"
        denial_comment = "I'm sorry but your work was denied because %s"
        for hit_id, assignments in self._known_hit_assignments():
            for assignment in assignments:
                assignment_id = assignment.AssignmentId
                if not self.mh.get_artifact("elicitation_assignments", {
                        "_id": assignment_id,
                        "state": "Submitted"
                }):
                    continue
                assignment_artifact = self.mh.get_artifact(
                    "elicitation_assignments", {"_id": assignment_id})
                recording_ids = assignment_artifact["recordings"]
                worker = self.mh.get_artifact(
                    "workers", {"eid": assignment_artifact["worker_id"]})
                if worker["state"] == "Approved":
                    # If the worker is approved, approve the assignment automatically
                    self.conn.approve_assignment(assignment_id,
                                                 approval_comment)
                    self.mh.update_artifact_by_id(
                        "elicitation_assignments", assignment_id,
                        "approval_time", datetime.datetime.now())
                    continue
                elif worker["state"] == "Rejected":
                    self.conn.reject_assignment(assignment_id,
                                                worker["rejection_reason"])
                    self.mh.update_artifact_by_id(
                        "elicitation_assignments", assignment_id,
                        "approval_time", datetime.datetime.now())
                    continue
                recording_uris = [
                    self.mh.get_artifact_by_id("recording_sources",
                                               recording_id,
                                               "recording_uri")
                    for recording_id in recording_ids
                ]
                command = ["gnome-mplayer"] + recording_uris
                # The original `a and b or c` indexed recording_uris[0] even
                # when the list was empty (IndexError); test both suffixes
                # only when a first recording exists.
                if recording_uris and recording_uris[0].endswith(
                        (" .wav", ".com.wav")):
                    continue
                print("Calling: %s" % command)
                call(command)
                approve_assignment = raw_input(
                    "Approve assignment(y/n/s)?")
                if approve_assignment == "s":
                    # skip
                    continue
                elif approve_assignment == "y":
                    # accept the assignment
                    self.conn.approve_assignment(assignment_id,
                                                 approval_comment)
                    self.mh.update_artifact_by_id(
                        "elicitation_assignments", assignment_id,
                        "approval_time", datetime.datetime.now())
                    approve_worker = raw_input("Approve worker(y/n)?")
                    if approve_worker == "y":
                        # approve the worker and all future assignments
                        self.mh.update_artifact_by_id(
                            "workers", worker["_id"], "approval_time",
                            datetime.datetime.now())
                elif approve_assignment == "n":
                    # Reject the assignment
                    reject_worker = raw_input(
                        "Reject this worker's future work?")
                    if reject_worker == "y":
                        # Reject the worker
                        reason = raw_input(
                            "Reason for rejecting this worker's future work:"
                        )
                        self.mh.update_artifact_by_id(
                            "workers", worker["_id"], "rejection_reason",
                            reason)
                        self.conn.reject_assignment(
                            assignment_id, denial_comment % reason + ".")
                    else:
                        reason = raw_input("Why reject the assignment?")
                        self.conn.reject_assignment(
                            assignment_id, denial_comment % reason + ".")

    def get_assignment_stats(self):
        """Print (and return) the effective hourly wage at $0.20 per assignment."""
        return self.effective_hourly_wage_for_approved_assignments(.20)

    def effective_hourly_wage_for_approved_assignments(self,
                                                       reward_per_assignment):
        """Calculate the effective hourly wage for Approved Assignments.

        Returns:
            The wage as a float, or ``None`` when there are no approved
            assignments or their total working time is zero (the original
            raised ZeroDivisionError in both cases).
        """
        time_format = "%Y-%m-%dT%H:%M:%SZ"
        approved_assignments = self.mh.get_artifacts_by_state(
            "elicitation_assignments", "Approved")
        total = datetime.timedelta(0)
        count = 0
        for assignment in approved_assignments:
            accepted = datetime.datetime.strptime(assignment["AcceptTime"],
                                                  time_format)
            submitted = datetime.datetime.strptime(assignment["SubmitTime"],
                                                   time_format)
            total += submitted - accepted
            count += 1
        if count == 0 or total.total_seconds() == 0:
            print("No approved assignments with a measurable completion time.")
            return None
        seconds_per_assignment = total.total_seconds() / count
        effective_hourly_wage = 60.0 * 60.0 / seconds_per_assignment * reward_per_assignment
        print("Effective completion time(%s) *reward(%s) = %s" %
              (seconds_per_assignment, reward_per_assignment,
               effective_hourly_wage))
        return effective_hourly_wage

    def enqueue_prompts_and_generate_hits(self):
        """Queue every ``New`` prompt and create a HIT whenever prompt pairs are available.

        Returns ``True`` early when the remaining batch balance cannot cover
        the next HIT; otherwise prints the remaining balance and returns
        ``None``.
        """
        # NOTE(review): block nesting was reconstructed from whitespace-collapsed
        # source (in particular which `if` the `else: return True` belongs
        # to); confirm against the original file.
        # NOTE(review): `cost_sensitive` is not defined in this method; it is
        # presumably a module-level flag. Verify it exists.
        prompts = self.mh.get_artifacts_by_state("prompts", "New")
        for prompt in prompts:
            self.mh.enqueue_prompt(prompt["_id"], 1, 5)
            prompt_queue = self.mh.get_prompt_queue()
            prompt_pairs = self.mh.get_prompt_pairs(prompt_queue)
            if prompt_pairs:
                hit_title = "Audio Elicitation"
                question_title = "Speak and Record your Voice"
                hit_description = "Speak the prompt and record your voice."
                keywords = "audio, elicitation, speech, recording"
                if cost_sensitive:
                    reward_per_clip = 0.04
                    max_assignments = 2
                    estimated_cost = self.hh.estimate_html_HIT_cost(
                        prompt_pairs,
                        reward_per_clip=reward_per_clip,
                        max_assignments=max_assignments)
                    prompts_in_hits = self.mh.prompts_already_in_hit(
                        prompt_pairs)
                    if prompts_in_hits:
                        # If one or more clips are already in a HIT, remove it from the queue
                        self.mh.remove_artifact_from_queue(prompts_in_hits)
                    elif self.balance - estimated_cost >= 0:
                        # if we have enough money, create the HIT
                        response = self.hh.make_html_elicitation_HIT(
                            prompt_pairs,
                            hit_title,
                            question_title,
                            keywords,
                            hit_description,
                            max_assignments=max_assignments,
                            reward_per_clip=reward_per_clip)
                        # response = self.hh.make_question_form_elicitation_HIT(prompt_pairs,hit_title,
                        #                              question_title, keywords)
                        self.balance = self.balance - estimated_cost
                        if isinstance(response, ResultSet) and len(
                                response) == 1 and response[0].IsValid:
                            response = response[0]
                            self.mh.remove_artifacts_from_queue(
                                "prompt_queue", prompt_queue)
                            prompt_ids = [w["prompt_id"] for w in prompt_queue]
                            hit_id = response.HITId
                            hit_type_id = response.HITTypeId
                            self.mh.create_elicitation_hit_artifact(
                                hit_id, hit_type_id, prompt_ids)
                            self.mh.update_artifacts_by_id(
                                "prompts", prompt_ids, "hit_id", hit_id)
                            self.logger.info("Successfully created HIT: %s" %
                                             hit_id)
                    else:
                        return True
        print("Amount left in batch: %s out of %s" %
              (self.balance, self.batch_cost))

    def allhits_liveness(self):
        """Interactively disable HITs, either all empty ones at once or one by one.

        Returns ``True`` after the bulk removal; ``None`` after the
        per-HIT walk-through.  ``MTurkRequestError`` propagates to the caller.
        """
        #allassignments = self.conn.get_assignments(hit_id)
        #first = self.ah.get_submitted_transcriptions(hit_id,str(clipid))
        hits = self.conn.get_all_hits()
        selection = raw_input("Remove all hits with no assignments?")
        if selection == "y":
            for hit in hits:
                hit_id = hit.HITId
                assignments = self.conn.get_assignments(hit_id)
                if len(assignments) == 0:
                    self._disable_hit_and_release_prompts(hit_id)
            return True
        for hit in hits:
            hit_id = hit.HITId
            print("HIT ID: %s" % hit_id)
            assignments = self.conn.get_assignments(hit_id)
            if len(assignments) == 0:
                if raw_input("Remove hit with no submitted assignments?(y/n)"
                             ) == "y":
                    self._disable_hit_and_release_prompts(hit_id)
            else:
                if raw_input("Remove hit with %s submitted assignments?(y/n)"
                             % len(assignments)) == "y":
                    self.conn.disable_hit(hit_id)

    def run(self):
        """Show the pipeline menu until the operator exits (``8`` or any unknown input)."""
        #audio_file_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/dep_trn"
        prompt_file_uri = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/doc/al_sents.snr"
        selection = 0
        #self.get_time_submitted_for_assignments()
        while selection != "8":
            selection = raw_input(
                """Prompt Source raw to Elicitations-Approved Pipeline:\n
                     1: PromptSource-Load_RawToList: Load Resource Management 1 prompt source files to queueable prompts
                     2: Prompt-ReferencedToHit: Queue all referenced prompts and create a HIT if the queue is full.
                     3: Prompt-HitToAssignmentSubmitted: Check all submitted assignments for Elicitations and download elicitations.
                     4: Maintain all assignments and hits.
                     5: (WARNING, approves all assignments) Approve all submitted assignments.
                     6: Calculate assignment stats.
                     7: Hand approve submitted assignments by elicitation and/or by worker.
                     8: Exit
                    """)
            if selection == "1":
                self.load_PromptSource_RawToList(prompt_file_uri)
            elif selection == "2":
                self.enqueue_prompts_and_generate_hits()
            elif selection == "3":
                self.load_assignment_hit_to_submitted()
            elif selection == "4":
                self.allhits_liveness()
            elif selection == "5":
                self.approve_assignment_submitted_to_approved()
            elif selection == "6":
                self.get_assignment_stats()
            elif selection == "7":
                self.approve_assignment_by_worker()
            else:
                # Anything unrecognised ends the session.
                selection = "8"

#    prompt_dict = self.ph.get_prompts(prompt_file_uri)

#     def get_time_submitted_for_assignments(self):
#         assignments = self.mh.get_all_artifacts("elicitation_assignments")
#         for assignment in assignments:
#             assignment_id = assignment["_id"]
#             a_assignment = self.conn.get_assignment(assignment_id)[0]
#             self.mh.update_artifact_by_id("elicitation_assignments", assignment_id, "SubmitTime", a_assignment.SubmitTime)
class MturkHelper(object):
    """
    Create and harvest Amazon Mechanical Turk tasks used to crowdsource
    product matching.

    Initialisation :
    - reference : reference of the product
    - osm_from : the origin osm of a product
    - osm_to : the osm to look into
    - key : optional; when not None the task is resolved with get_task()
    - hitid : optional HIT id, stored as given
    """

    # SECURITY(review): the non-sandbox branch keeps an AWS key pair in source
    # control. The values are left untouched so existing deployments keep
    # working, but they should be rotated and moved into settings.
    if settings.SANDBOX:
        AWS_SECRET_ACCESS_KEY = settings.AWS_SECRET_ACCESS_KEY
        AWS_ACCESS_KEY_ID = settings.AWS_ACCESS_KEY_ID
    else:
        AWS_SECRET_ACCESS_KEY = 'e6/8e5lcCcESPKT/fe6kYkJtf0+7F2w7459WTJ0v'
        AWS_ACCESS_KEY_ID = 'AKIAIP5JQO7FQX6Q7JAQ'

    def __init__(self, reference=None, osm_from=None, osm_to=None, key=None,
                 hitid=None):
        self.reference = reference
        self.osm_from = osm_from
        self.osm_to = osm_to
        self.key = key
        self.hitid = hitid
        # Build the connection first so that every attribute exists by the
        # time get_task() runs.
        self.mtc = MTurkConnection(
            aws_access_key_id=MturkHelper.AWS_ACCESS_KEY_ID,
            aws_secret_access_key=MturkHelper.AWS_SECRET_ACCESS_KEY,
            host=settings.HOST)
        self.task = None if key is None else self.get_task()

    def get_all_reviewable_hits(self):
        """Return the reviewable HITs of every result page, 50 per request."""
        page_size = 50
        hits = self.mtc.get_reviewable_hits(page_size=page_size)
        print("Total results to fetch %s " % hits.TotalNumResults)
        print("Request hits page %i" % 1)
        # Integer ceiling division; page 1 has already been fetched above.
        total_pages = (int(hits.TotalNumResults) + page_size - 1) // page_size
        for pn in range(2, total_pages + 1):
            print("Request hits page %i" % pn)
            hits.extend(self.mtc.get_reviewable_hits(page_size=page_size,
                                                     page_number=pn))
        return hits

    def get_hits(self, validate=False, all_hits=False):
        """Store the 'flagged' answers of each HIT as ResultTask rows.

        - validate : when true, assignments without a matching Task are
          approved and every visited HIT is disabled afterwards.
        - all_hits : walk every HIT instead of only the reviewable ones.

        Errors raised while approving or disabling are printed and ignored.
        """
        if all_hits:
            hits = self.mtc.get_all_hits()
        else:
            hits = self.get_all_reviewable_hits()

        for hit in hits:
            print("####################")
            print("--------------------")
            print("HitId = %s" % (hit.HITId))
            assignments = self.mtc.get_assignments(hit.HITId)
            # Getting task associated to hit
            tasks = Task.objects.filter(hitId=hit.HITId)
            print('Number of corresponding tasks = %d' % len(tasks))
            task = tasks[0] if len(tasks) > 0 else None

            for assignment in assignments:
                print("AssignmentId = %s" % (assignment.AssignmentId))
                print("Answers of the worker %s" % assignment.WorkerId)
                for question_form_answer in assignment.answers[0]:
                    if question_form_answer.qid != 'flagged':
                        continue
                    for value in question_form_answer.fields:
                        # NOTE(review): the source was collapsed onto one
                        # line; `elif validate` is paired with the nearest
                        # `if` here - confirm against the original layout.
                        if task is not None:
                            # Saving resultTask
                            print('Saving result task, result = %s' % (value))
                            resulttask, created = ResultTask.objects.get_or_create(
                                task=task,
                                assignementId=assignment.AssignmentId,
                                workerId=assignment.WorkerId)
                            resulttask.reference = value
                            resulttask.save()
                        elif validate:
                            try:
                                self.mtc.approve_assignment(
                                    assignment.AssignmentId)
                            except Exception as e:
                                print(e)

            try:
                if validate:
                    self.mtc.disable_hit(hit.HITId)
            except Exception as e:
                print(e)
            print("--------------------")
# List reviewable HITs, then walk every HIT and, for each of its assignments,
# POST the assignment/HIT/worker ids to the MTurk externalSubmit URL and print
# the response body.
import urllib
import urllib2
from contextlib import closing

hostname = 'mechanicalturk.amazonaws.com'
mturkparams = dict(
    aws_access_key_id=config.get('AWS Access', 'aws_access_key_id'),
    aws_secret_access_key=config.get('AWS Access', 'aws_secret_access_key'),
    host=hostname)
mtc = MTurkConnection(**mturkparams)

print("Reviewable:")
for hit in get_all_reviewable_hits(mtc):
    print(hit)

print("HITs:")
for hit in mtc.get_all_hits():
    print("\tHitID: %s" % hit.HITId)
    print("\tAssignments:")
    for assignment in mtc.get_assignments(hit.HITId):
        print("\t\tWorker ID: %s" % assignment.WorkerId)
        print("\t\tAssignment ID: %s" % assignment.AssignmentId)
        print("\t\tSubmit URL: %s" % (
            "https://www.mturk.com/mturk/externalSubmit?assignmentId=%s&hitId=%s&workerId=%s"
            % (assignment.AssignmentId, hit.HITId, assignment.WorkerId)))
        values = {'assignmentId': assignment.AssignmentId,
                  'hitId': hit.HITId,
                  'workerId': assignment.WorkerId}
        req = urllib2.Request("https://www.mturk.com/mturk/externalSubmit",
                              urllib.urlencode(values))
        # closing() releases the HTTP connection even if read() fails.
        with closing(urllib2.urlopen(req)) as response:
            result = response.read()
        print(result)
#-------------------------------
import getpass

ACCESS_ID = raw_input("ACCESS_ID: ")
# getpass keeps the secret key from being echoed to the terminal.
SECRET_KEY = getpass.getpass("SECRET_KEY: ")

#TODO: Change from sandbox when live
HOST = 'mechanicalturk.sandbox.amazonaws.com'

mtc = MTurkConnection(aws_access_key_id=ACCESS_ID,
                      aws_secret_access_key=SECRET_KEY,
                      host=HOST)

#TODO: Remove?
print(mtc.get_account_balance())

#TODO: Remove - DELETES ALL PREVIOUS USER HITS (Resets for testing...)
# Only run the reset against the sandbox, so that switching HOST to the live
# endpoint cannot wipe real HITs by accident.
if 'sandbox' in HOST:
    Reset = mtc.get_all_hits()
    for hit in Reset:
        mtc.disable_hit(hit.HITId)
        print("Old HIT: " + hit.HITId + " - Disabled")

#-------------------------------
#-------- HIT Generation -------
#-------------------------------
HIT_IDs = HITGeneration.GenerateCaptionHIT(mtc, count, assignmentNum,
                                           embedded_urls)
Completed_HITs = []  #Used to link caption and validation HITs
Accepted_Answers = []  #Used to build the SRT File
CaptionAndValidate.CaptionAndValidationLoop(mtc, HIT_IDs, count,
                                            assignmentNum, embedded_urls,
                                            Completed_HITs, Accepted_Answers)
pn += 1 print "Request hits page %i" % pn temp_hits = mtc.get_reviewable_hits(page_size=page_size, page_number=pn) hits.extend(temp_hits) return hits mtc = MTurkConnection(aws_access_key_id="EXAMPLE", aws_secret_access_key="EXAMPLE", host='mechanicalturk.amazonaws.com') print "Reviewable:" for hit in get_all_reviewable_hits(mtc): print hit print "HITs:" for hit in mtc.get_all_hits(): print "\tHitID:", hit.HITId print "\tAssignments:" for assignment in mtc.get_assignments(hit.HITId): print "\t\tWorker ID:", assignment.WorkerId print "\t\tAssignment ID:", assignment.AssignmentId #print "\t\tSubmit URL:", "https://www.mturk.com/mturk/externalSubmit?assignmentId=%s&hitId=%s&workerId=%s" % (assignment.AssignmentId, hit.HITId, assignment.WorkerId) #import urllib2, urllib #values = {'assignmentId':assignment.AssignmentId, 'hitId':hit.HITId, 'workerId':assignment.WorkerId} #req = urllib2.Request("https://www.mturk.com/mturk/externalSubmit", urllib.urlencode( values )) #response = urllib2.urlopen(req) #result = response.read() #print result
#"--free" in sys.argv else 0.10
# Create a new HIT when one of the NEWHIT flags was given; otherwise extend
# the single existing HIT by `assignments` more assignments.
# NOTE(review): nesting was rebuilt from a collapsed one-line source; the
# exit is assumed to apply whenever the HIT count is not exactly 1 - confirm.
try:
    #if "-e" in sys.argv or "--extend" in sys.argv
    if NEWHIT.intersection(argset):
        mtc.create_hit(question=question,
                       title=TITLE,
                       description=DESCRIPTION,
                       keywords='Photography',
                       duration=3600,
                       reward=payment,
                       qualifications=qualifications,
                       max_assignments=assignments,
                       approval_delay=0)
    else:
        DynamoHIT = list(mtc.get_all_hits())
        if len(DynamoHIT) == 1:
            DynamoHIT = DynamoHIT[0]
            print("extending HIT {} by {} assignments"
                  .format(DynamoHIT.HITId, assignments))
            mtc.extend_hit(DynamoHIT.HITId, assignments=assignments)
        else:
            print("Can't identify 1 clear HIT to extend, breaking.")
            if len(DynamoHIT) == 0:
                print("You seem to have no HITs. Please use the -n or --new parameters to make one.")
            sys.exit(1)
except MTurkRequestError as e:
    print('request failed')
    print(e.body)
else:
    print('request successful')
# Worker answer (1-8) -> note length in beats; any other answer gives 0.
_NOTE_DURATIONS = {
    1: .375,  # dotted sixteenth
    2: .5,    # eighth
    3: .75,   # dotted eighth
    4: 1,     # quarter
    5: 1.5,   # dotted quarter
    6: 2,     # half
    7: 3,     # dotted half
    8: 4,     # whole
}

# Worker answer (1-8) -> the four pitch arguments handed to addChord.
# None marks a slot that is sent as the literal -1 and is never transposed.
_CHORD_PITCHES = {
    1: (60, 64, 67, None),  # C maj Joy
    2: (60, 63, 67, 70),    # C min9 Sadness
    3: (60, 64, 66, 69),    # C dim7 Anger
    4: (60, 64, 66, None),  # C flat5 Fear
    5: (60, 64, 67, 69),    # C maj6 Trust
    6: (60, 63, 67, 69),    # C m6 Distrust
    7: (60, 63, 66, 70),    # C m7b5 Surprise
    8: (60, 64, 67, 71),    # C maj7 Anticipation
}


def _remove_unused_hits(mtc, passes=5, delay=20):
    """Disable every HIT on the account, `passes` times, `delay` s apart."""
    for attempt in range(1, passes + 1):
        print("Removing unused HITs... Pass #%d of %d" % (attempt, passes))
        if attempt > 1:
            sleep(delay)
        for hit in mtc.get_all_hits():
            mtc.disable_hit(hit.HITId)


def _collect_answers(mtc, hits):
    """Approve every assignment of `hits` and return their integer answers.

    Each returned item is the flat list of int answers of one assignment.
    Every HIT is disabled once its assignments have been read.
    """
    collected = []
    for hit in hits:
        for assignment in mtc.get_assignments(hit.HITId):
            print("Answers of the worker %s" % assignment.WorkerId)
            answers = [int(value)
                       for question_form_answer in assignment.answers[0]
                       for value in question_form_answer.fields]
            print("Responses :  %s" % (answers,))
            collected.append(answers)
            mtc.approve_assignment(assignment.AssignmentId)
            print("--------------------")
        mtc.disable_hit(hit.HITId)
    return collected


def main(argv):
    """Turn crowd-sourced ratings of tweets into a MIDI track and play it.

    argv[0] names the output file (".mid" is appended), argv[1] is passed to
    initializeTrack, and the optional argv[2] is the tweet topic. With fewer
    than two arguments only the usage line is printed.
    """
    if len(argv) < 2:
        print("Usage: tweetbeats.py <song_title> <instrument_number> <optional_topic>")
        return

    # check for command line argument
    user_topic = argv[2] if len(argv) > 2 else ""

    # --- Gather Tweets ---
    print("Gathering Tweets...")
    tc = TweetCollector()
    results = tc.CollectTweets(user_topic)
    print("Topic: " + results[0])

    # --- Create Hits ---
    print("Creating HITs...")
    mtur = MTurk(ACCESS_ID, SECRET_KEY, HOST)
    for result in results[1]:
        # Only printable characters are sent in the HIT text.
        mtur.createHit("".join(ch for ch in result if ch in string.printable))

    mtc = MTurkConnection(aws_access_key_id=ACCESS_ID,
                          aws_secret_access_key=SECRET_KEY,
                          host=HOST)
    hits = get_all_reviewable_hits(mtc)
    while len(hits) < MIN_TWEETS:
        print("Not enough hits. Will try again in 10 seconds....")
        sleep(10)
        hits = get_all_reviewable_hits(mtc)

    hits3 = _collect_answers(mtc, hits)

    # Remove unused HITS; make 5 passes to clean up as best we can
    _remove_unused_hits(mtc)

    # --- Make Hits into Music ---
    initializeTrack(argv[1])
    beat = 1
    for result in hits3:
        duration = _NOTE_DURATIONS.get(result[1], 0)
        # A transposition is drawn for every answer, even when the chord
        # number is unknown, so the random sequence stays unchanged.
        shift = random.choice(range(-11, 12))
        pitches = _CHORD_PITCHES.get(result[0])
        if pitches is not None:
            voiced = [-1 if pitch is None else pitch + shift
                      for pitch in pitches]
            addChord(beat, duration, 100, *voiced)
        beat += duration
    addChord(beat, 4, 0, 60, 60, 60, 60)  # silence to allow last note to fade out
    closeTrack(argv[0])
    music_file = argv[0] + ".mid"

    # set up the mixer
    freq = 44100       # audio CD quality
    bitsize = -16      # 16 bit; a negative size requests signed samples
    channels = 2       # 1 is mono, 2 is stereo
    buffer_size = 2048  # number of samples
    pygame.mixer.init(freq, bitsize, channels, buffer_size)
    # optional volume 0 to 1.0
    pygame.mixer.music.set_volume(1.0)
    pygame.mixer.music.load(music_file)
    print("Music file %s loaded!" % music_file)
    clock = pygame.time.Clock()
    pygame.mixer.music.play()
    while pygame.mixer.music.get_busy():
        # check if playback has finished
        clock.tick(30)
# NOTE(review): this class was rebuilt from a source whose line breaks and
# indentation had been collapsed; block nesting follows the most plausible
# reading and should be confirmed against the original layout.

# Difficulty keywords, in ascending order.
_DIFFICULTIES = ('easy', 'medium', 'hard', 'very_hard', 'priority')

# Worker id that the statistics skip ("that's us").
_OWN_WORKER_ID = 'A2PRAI0ABXN99X'


def _load_json(path):
    """Parse the whole file at `path` as JSON and close it afterwards."""
    with open(path) as handle:
        return json.load(handle)


def _load_last_json_line(path):
    """Parse the last line of `path` as JSON and close the file afterwards."""
    with open(path) as handle:
        return json.loads(handle.readlines()[-1])


def _result_dir_name(result_path):
    """Return the path component directly above ``result.json``.

    Raises ValueError when `result_path` has no ``result.json`` component.
    """
    parts = result_path.split(os.path.sep)
    return parts[parts.index('result.json') - 1]


class TurkerResults:
    """Reporting and payout helpers over MTurk HITs and local result files."""

    # TODO: this should be moved to DB
    def get_tasklet_from_hit(self, hitid):
        """Return the task_id rows logged for `hitid`, excluding picked/internal sessions."""
        self.cur.execute("""
            select task_id from tasklet_session_log
            where assignment_id not like 'picked_%%'
            and worker_id not like 'internal_%%'
            and hit_id = %s;
            """, [hitid])
        return self.cur.fetchall()

    def get_tasklet_kw(self, tid):
        """Return the keywords column of tasklet `tid`."""
        self.cur.execute("""
            select keywords from tasklets where tasklets.id = %s
            """, [tid])
        return self.cur.fetchone()[0]

    def __init__(self):
        self.config = HaCRSUtil.get_config('../config.ini')
        HOST = self.config.get('mturk', 'host')
        AWS_ACCESS_KEY_ID = self.config.get('mturk', 'access_key_id')
        AWS_SECRET_ACCESS_KEY = self.config.get('mturk', 'secret_access_key')
        self.MTconnection = MTurkConnection(
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            host=HOST)
        self.db = HaCRSDB()
        self.con, self.cur = HaCRSUtil.get_db(self.config)
        self.mt = HaCRSTurker()

    def assignment_payout(self, assignments, amount):
        """Return `amount` summed once per approved assignment.

        Any assignment that is not 'Approved' drops into the debugger.
        """
        paysum = 0
        for assignment in assignments:
            if assignment.AssignmentStatus == 'Approved':
                paysum += amount
            else:
                pdb.set_trace()
        return paysum

    def get_paid_bonus(self, bonuses, assignmentid, workerid):
        """Return the price of the first bonus matching both ids, else 0."""
        for bonus in bonuses:
            if bonus['aid'] == assignmentid and bonus['wid'] == workerid:
                return bonus['price']
        return 0

    def get_all_hits(self):
        """Walk every HIT and total assignments and approved payouts.

        The totals only fed print statements that are disabled, so nothing
        is returned.
        """
        all_hits = list(self.MTconnection.get_all_hits())
        totassignments = 0
        maxtotalspent = 0
        for hit in all_hits:
            assignments = self.MTconnection.get_assignments(hit.HITId)
            maxtotalspent += self.assignment_payout(assignments,
                                                    float(hit.Amount))
            totassignments += len(assignments)

    def log_worker(self, worker_base, worker_bonus, worker_solves,
                   difficulty_solves, assignment, bonuses, hit):
        """Add one assignment to the per-worker and per-keyword tallies.

        All four dict arguments are updated in place:
        - worker_base[wid] grows by float(hit.Amount)
        - worker_bonus[wid] grows by the matching paid bonus, if positive
        - worker_solves[wid][hit.Keywords] and
          difficulty_solves[hit.Keywords] are incremented
        """
        wid = assignment.WorkerId
        if wid not in worker_base:
            worker_base[wid] = 0.0
            worker_solves[wid] = {}
        wbonus = self.get_paid_bonus(bonuses, assignment.AssignmentId, wid)
        worker_base[wid] += float(hit.Amount)
        if wbonus > 0:
            worker_bonus[wid] = worker_bonus.get(wid, 0.0) + wbonus
        solves = worker_solves[wid]
        solves[hit.Keywords] = solves.get(hit.Keywords, 0) + 1
        difficulty_solves[hit.Keywords] = (
            difficulty_solves.get(hit.Keywords, 0) + 1)

    def get_all_spendings_by_worker(self):
        """Tally base pay, bonuses and solves for every approved assignment.

        The tallies only fed print statements that are disabled, so nothing
        is returned.
        """
        all_hits = list(self.MTconnection.get_all_hits())
        bonuses = _load_json('bonus_paid.json')
        worker_base = {}
        worker_bonus = {}
        worker_solves = {}
        difficulty_solves = {}
        for hit in all_hits:
            for assignment in self.MTconnection.get_assignments(hit.HITId):
                if assignment.AssignmentStatus == 'Approved':
                    self.log_worker(worker_base, worker_bonus, worker_solves,
                                    difficulty_solves, assignment, bonuses,
                                    hit)
        return

    def test_seek_tasklet(self, tid, program):
        """Return the 'triggered' value of the tasklet's seek file, or None if the file is absent."""
        fseek = '{}/{}/{}-seek.json'.format(
            self.config.get('general', 'resultsfolder'), program, tid)
        if os.path.exists(fseek):
            return _load_json(fseek)['triggered']
        return None

    def approve_reject(self, taskid_earnings):
        """Approve submitted assignments whose payout reached the base amount.

        Only HITs created in the EXPERIMENT_START window that have completed
        assignments and logged tasklets are considered. `taskid_earnings`
        maps tasklet id -> worker id -> {'payout', 'amount'}.
        """
        global EXPERIMENT_START
        all_hits = list(self.MTconnection.get_all_hits())
        worker_solvecount = {}
        worker_solvedifficulty = {}
        solved = 0
        tasklet_hit_done = set()
        empty = dict.fromkeys(_DIFFICULTIES, 0)
        for hit in all_hits:
            if not hit.CreationTime.startswith(EXPERIMENT_START):
                continue
            if hit.NumberOfAssignmentsCompleted == 0:
                continue
            tasklet_ids = self.get_tasklet_from_hit(hit.HITId)
            if len(tasklet_ids) == 0:
                continue
            for line in tasklet_ids:
                tid = str(line[0])
                tasklet = self.db.get_full_tasklet(tid)
                assignments = self.MTconnection.get_assignments(hit.HITId)
                for assignment in assignments:
                    wid = assignment.WorkerId
                    if wid not in worker_solvecount:
                        worker_solvecount[wid] = 0
                        worker_solvedifficulty[wid] = copy.deepcopy(empty)
                    assert tasklet['type'] == 'SEED', 'Wrong tasklet type!'
                    if assignment.AssignmentStatus == 'Approved':
                        tkey = "{}/{}/{}".format(hit.HITId,
                                                 assignment.AssignmentId, wid)
                        if tkey not in tasklet_hit_done:
                            worker_solvedifficulty[wid][tasklet['keywords']] += 1
                            tasklet_hit_done.add(tkey)
                    if hit.HITReviewStatus == 'NotReviewed':
                        if assignment.AssignmentStatus != 'Submitted':
                            solved += 1
                            continue
                        try:
                            money = taskid_earnings[tid][wid]
                        except Exception:
                            continue
                        # TODO - rejecting when payout < amount is disabled:
                        # self.MTconnection.reject_assignment(assignment.AssignmentId)
                        if money['payout'] >= money['amount']:
                            self.MTconnection.approve_assignment(
                                assignment.AssignmentId,
                                feedback="Thanks for participating, more similar tasks coming soon")
                    worker_solvecount[wid] += float(hit.Amount)

    def split_composite_key(self, k):
        """Split "{taskid}-{hitid}-{assignmentid}-{workerid}" into a dict.

        The first 36 characters are the task id. Raises AssertionError or
        ValueError when the remaining parts do not have the expected count
        or lengths.
        """
        tid = k[:36]
        hitid, aid, workerid = k[37:].split('-')
        assert len(hitid) == 30
        assert len(aid) == 30
        assert len(workerid) in [11, 12, 13, 14]
        return {'tid': tid, 'hitid': hitid, 'aid': aid, 'workerid': workerid}

    def get_seed_stats(self, seed_taskletid_solved):
        """Collect SEED payout statistics, then drop into the debugger.

        Nothing is returned; the locals are meant to be inspected from pdb.
        """
        global EXPERIMENT_START
        unique_seed_workers = set()
        worker_payouts_base = {}
        worker_payouts_bonus = {}
        worker_payouts_combined = {}
        worker_solves = {}
        tasklet_solved = set()
        total_payout_base = 0
        total_payout_bonus = 0
        tasklet_difficulty = {}
        program_solves = {}
        # NOTE(review): prog_maxcoverage is never assigned in this method; it
        # has to exist at module level or the next loop raises NameError.
        for program in _load_json(self.config.get('general', 'programsjson')):
            prog_maxcoverage[program] = 0
            if program not in program_solves:
                program_solves[program] = 0
        program = None
        total_payout = 0
        for tasklet in self.db.get_seed_tasklets():
            if not str(tasklet['timestamp']).startswith(EXPERIMENT_START):
                continue
            if tasklet['program'] in ['seed_training', 'A_Game_of_Chance']:
                continue
            if tasklet['id'] in seed_taskletid_solved:
                program_solves[tasklet['program']] += 1
            pattern = '{}/{}/{}*/*.json'.format(
                self.config.get('general', 'resultsfolder'),
                tasklet['program'], str(tasklet['id']))
            for jfile in glob.glob(pattern):
                try:
                    metadata = self.split_composite_key(_result_dir_name(jfile))
                except Exception:
                    # fake keys
                    continue
                wid = metadata['workerid']
                # that's us
                if wid == _OWN_WORKER_ID:
                    continue
                results = _load_last_json_line(jfile)
                tasklet = self.db.get_full_tasklet(metadata['tid'])
                payout = HaCRSUtil.get_current_payout(
                    tasklet['payout_arr'], results['new_transitions'])
                prog_maxcoverage[tasklet['program']] = max(
                    prog_maxcoverage[tasklet['program']], results['coverage'])
                if payout <= 0:
                    continue
                worker_solves[wid] = worker_solves.get(wid, 0) + 1
                unique_seed_workers.add(wid)
                tasklet_difficulty[tasklet['keywords']] = (
                    tasklet_difficulty.get(tasklet['keywords'], 0) + 1)
                tasklet_solved.add(tasklet['id'])
                if wid not in worker_payouts_base:
                    worker_payouts_base[wid] = []
                    worker_payouts_bonus[wid] = []
                    worker_payouts_combined[wid] = []
                worker_payouts_base[wid].append(tasklet['amount'])
                total_payout_base += tasklet['amount']
                if payout > tasklet['amount']:
                    bonus = round(payout - tasklet['amount'], 2)
                    # Count each bonus once (it used to be added twice).
                    total_payout_bonus += bonus
                    worker_payouts_bonus[wid].append(bonus)
                worker_payouts_combined[wid].append(round(payout, 2))
                total_payout += payout
        pdb.set_trace()

    def get_solve_ratio(self):
        """Compute per-tasklet, per-worker earnings from the result files.

        Returns (taskid_earnings, prog_maxcoverage, seed_taskletid_solved):
        - taskid_earnings: tasklet id -> worker id -> {'payout', 'amount'}
        - prog_maxcoverage: program -> highest 'coverage' seen in SEED results
        - seed_taskletid_solved: id -> True for SEED tasklets whose payout
          reached the base amount
        """
        global EXPERIMENT_START
        prog_maxcoverage = {}
        taskid_earnings = {}
        total_payout_with_bonus = 0
        goalreached = 0
        goalnotreached = 0
        for program in _load_json(self.config.get('general', 'programsjson')):
            prog_maxcoverage[program] = 0
        program = None
        seed_taskletid_solved = {}
        for tasklet in self.db.get_seed_tasklets() + self.db.get_seek_tasklets():
            if not str(tasklet['timestamp']).startswith(EXPERIMENT_START):
                continue
            pattern = '{}/{}/{}*/*.json'.format(
                self.config.get('general', 'resultsfolder'),
                tasklet['program'], str(tasklet['id']))
            for jfile in glob.glob(pattern):
                try:
                    metadata = self.split_composite_key(_result_dir_name(jfile))
                except Exception:
                    # fake keys
                    continue
                tid = metadata['tid']
                wid = metadata['workerid']
                # that's us
                if wid == _OWN_WORKER_ID:
                    continue
                results = _load_last_json_line(jfile)
                tasklet = self.db.get_full_tasklet(tid)
                if tasklet is None:
                    continue
                if tasklet['type'] == 'SEED':
                    payout = HaCRSUtil.get_current_payout(
                        tasklet['payout_arr'], results['new_transitions'])
                    prog_maxcoverage[tasklet['program']] = max(
                        prog_maxcoverage[tasklet['program']],
                        results['coverage'])
                elif tasklet['type'] == 'SEEK':
                    payout = tasklet['amount']
                elif tasklet['type'] == 'DRILL':
                    payout = tasklet['amount']
                total_payout_with_bonus += payout
                # over-achieved
                if tasklet['amount'] <= payout:
                    goalreached += 1
                elif tasklet['amount'] > payout:
                    goalnotreached += 1
                if tasklet['amount'] <= payout and tasklet['type'] == 'SEED':
                    for hit in self.db.get_hit_for_tasklet(tasklet['id']):
                        # The lookup result is unused; the call is kept in
                        # case it has side effects.
                        self.mt.get_assignment_from_hit(hit)
                    seed_taskletid_solved[tasklet['id']] = True
                taskid_earnings.setdefault(tid, {})[wid] = {
                    'payout': payout, 'amount': tasklet['amount']}
        pprint(taskid_earnings)
        return taskid_earnings, prog_maxcoverage, seed_taskletid_solved

    def log_bonus(self, tid, wid, aid, price):
        """Append a bonus record to self.bonuses and rewrite bonus_paid.json."""
        self.bonuses.append({'tid': tid, 'wid': wid, 'aid': aid,
                             'price': price})
        with open('bonus_paid.json', 'w') as handle:
            json.dump(self.bonuses, handle, sort_keys=True, indent=4,
                      separators=(',', ': '))

    def do_pay_bonus(self, tid, wid, aid, price):
        """Record the bonus, then grant it; return True when the grant succeeded.

        The record is written before the grant is attempted, so a failed
        grant still leaves an entry in bonus_paid.json.
        """
        assert len(tid) > 5, 'tasklet id mismatch'
        reason = "We issued a bonus for reaching a stretch goal of our task - Thanks!"
        assert price < 5
        self.log_bonus(tid, wid, aid, price)
        try:
            self.MTconnection.grant_bonus(wid, aid, Price(price), reason)
            return True
        except Exception:
            return False

    def bonus_paid_before(self, tid, wid, aid):
        """Return True when self.bonuses already holds this tid/aid/wid triple."""
        return any(bonus['tid'] == tid and bonus['aid'] == aid
                   and bonus['wid'] == wid
                   for bonus in self.bonuses)

    def check_bonus(self, taskid_earnings):
        """Work out the SEED bonus owed per worker from the result files.

        Loads self.bonuses from bonus_paid.json. Paying the bonus is still
        disabled (see TODO), so only local totals are computed.
        """
        self.bonuses = _load_json('bonus_paid.json')
        total_bonus_issued = 0
        worker_bonus = {}
        for program in _load_json(self.config.get('general', 'programsjson')):
            pattern = '{}/{}/*/result.json'.format(
                self.config.get('general', 'resultsfolder'), program)
            for jfile in glob.glob(pattern):
                try:
                    xkey = _result_dir_name(jfile)
                    if xkey.endswith('-OLD') or xkey.endswith('-internal_zardus'):
                        continue
                    metadata = self.split_composite_key(xkey)
                except Exception:
                    continue
                wid = metadata['workerid']
                if wid == _OWN_WORKER_ID:
                    continue
                # Parsed only so that an unreadable result file still raises.
                _load_last_json_line(jfile)
                tasklet = self.db.get_full_tasklet(metadata['tid'])
                try:
                    money = taskid_earnings[str(tasklet['id'])][wid]
                except Exception:
                    continue
                if tasklet['type'] != 'SEED':
                    # We only pay a bonus for SEEDing
                    continue
                if money['payout'] > money['amount']:
                    bonus = round(money['payout'] - money['amount'], 2)
                    # Accumulate by assignment; `+=` on the already-summed
                    # value used to double the running total.
                    worker_bonus[wid] = round(
                        worker_bonus.get(wid, 0) + bonus, 2)
                    if not self.bonus_paid_before(str(tasklet['id']), wid,
                                                  metadata['aid']):
                        # TODO...
                        # if self.do_pay_bonus(str(tasklet['id']), wid, metadata['aid'], bonus):
                        #     total_bonus_issued += bonus
                        pass

    def show_medium_hard(self, taskid_earnings):
        """Look up the difficulty of every earning tasklet, then exit(1).

        Raises ValueError when a tasklet's keyword is not a known difficulty.
        """
        for tasklet in taskid_earnings:
            kw = self.get_tasklet_kw(tasklet)
            kwkey = _DIFFICULTIES.index(kw)
            if any(entry['payout'] >= entry['amount']
                   for entry in taskid_earnings[tasklet].values()):
                pass
        sys.exit(1)

    def approve_single(self, hitid):
        """Approve the first (previously rejected) assignment of `hitid`.

        Stops in the debugger first so the data can be verified by hand.
        """
        xhit = self.MTconnection.get_hit(hitid)
        assignments = self.MTconnection.get_assignments(xhit[0].HITId)
        pdb.set_trace()
        rc = self.MTconnection.approve_rejected_assignment(
            assignments[0].AssignmentId,
            feedback="Thanks for participating, more similar tasks coming soon")

    def trace_hit(self, hitid):
        """Fetch `hitid` and stop in the debugger for manual inspection."""
        xhit = self.MTconnection.get_hit(hitid)
        pdb.set_trace()
feedback="Not the right code or you have already done the same task!") print 'rejected', workerId config = ConfigParser.ConfigParser() config.read("./keys.ignore") ACCESS_ID = config.get('keys', 'ACCESS_ID') SECRET_KEY = config.get('keys', 'SECRET_KEY') price = Price(0.01) HOST = 'mechanicalturk.amazonaws.com' mturk = MTurkConnection(aws_access_key_id=ACCESS_ID, aws_secret_access_key=SECRET_KEY, host=HOST) hits = mturk.get_all_hits() db = MySQLdb.connect("localhost", "erik", "erik", db_name) cursor = db.cursor() for hit in hits: if hit.HITStatus != 'Assignable': continue assignments = mturk.get_assignments(hit.HITId, status="Submitted", page_size=100) #assignments = mturk.get_assignments(hit.HITId, page_size=100) for assignment in assignments: answers = assignment.answers[0] code = "-1" for answer in answers:
#from analysis_toolbox import *
import ast
from boto.mturk.connection import MTurkRequestError
from boto.mturk.connection import MTurkConnection
import datetime
from secret import SECRET_KEY,ACCESS_KEY,AMAZON_HOST

# Approve every assignment that get_assignments returns for every HIT.

#Start Configuration Variables
AWS_ACCESS_KEY_ID = ACCESS_KEY
AWS_SECRET_ACCESS_KEY = SECRET_KEY

connection = MTurkConnection(aws_access_key_id=AWS_ACCESS_KEY_ID,
                             aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                             host=AMAZON_HOST)
print('Connected to AMT')

# Materialise the listing up front; connection.get_reviewable_hits() is the
# alternative source that was tried here before.
all_hits = list(connection.get_all_hits())

for hit in all_hits:
    assignments = connection.get_assignments(hit.HITId)
    print(assignments)
    for assignment in assignments:
        print("Working on  %s" % (assignment,))
        try:
            connection.approve_assignment(assignment.AssignmentId)
        except MTurkRequestError:
            # Probably already approved or rejected this assignment previously
            print("already approved/rejected")
        else:
            print("approved  %s" % (assignment.AssignmentId,))
def go():
    """Download, classify and optionally accept/reject uploaded HIT results.

    Only HITs whose title ends in a ``.jpg`` or ``.png`` file name are
    processed. Each assignment's uploaded file is fetched when --download is
    set, typed with libmagic, renamed with the detected extension, and the
    assignment is approved or rejected when --accept / --reject are set.
    Exits with status 1 when AWS credentials are missing.
    """
    options = parseCommandLine()
    ACCESS_ID = options.access_id
    SECRET_KEY = options.secret_key
    if ACCESS_ID is None or SECRET_KEY is None:
        print("missing AWS credentials")
        sys.exit(1)

    HOST = 'mechanicalturk.amazonaws.com'
    mtc = MTurkConnection(aws_access_key_id=ACCESS_ID,
                          aws_secret_access_key=SECRET_KEY,
                          host=HOST)
    results_dir = "./results"
    magic_extension_map = {'JPEG': '.jpeg', 'PNG': '.png'}

    hit_count = counter()
    assignment_count = counter()
    accept_count = counter()
    reject_count = counter()

    for hit in mtc.get_all_hits():
        next(hit_count)
        tokens = hit.Title.lower().split()
        # A blank title has no last token; treat it like any other title that
        # does not name an image instead of raising IndexError.
        if not tokens or not tokens[-1].endswith(('.jpg', '.png')):
            print("Skipping HIT: " + hit.Title)
            continue
        basename = os.path.splitext(tokens[-1])[0]
        output_dir = os.path.join(results_dir, basename)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for assignment in mtc.get_assignments(hit.HITId):
            status = assignment.AssignmentStatus
            if options.skip_approved and status == 'Approved':
                continue
            if options.skip_rejected and status == 'Rejected':
                continue
            print("Assignment Status %s" % status)
            next(assignment_count)
            assignment_id = assignment.AssignmentId
            output_filename = os.path.join(output_dir, assignment_id)
            url = get_file_upload_url_only(mtc, assignment_id)

            if not url:
                next(reject_count)
                if options.reject:
                    print(" Rejecting " + assignment_id)
                    mtc.reject_assignment(
                        assignment_id,
                        "We require a downloadable file as a result per the instructions. No file found in your submission.")
                else:
                    print(" No downloadable file found. Use --reject to reject " + assignment_id)
                continue

            if not options.download:
                print(" Use --download to fetch " + url)
                continue

            curl_url_to_output_file(url, output_filename)
            magic_type = magic.from_file(output_filename).split()[0]
            add_extension = magic_extension_map.get(magic_type, '.dat')
            # If we don't get .png, .jpeg, we really can't use the files.
            print("Processing assignment: " + assignment_id)
            if add_extension == '.dat':
                next(reject_count)
                if options.reject:
                    print(" Rejecting " + assignment_id)
                    mtc.reject_assignment(
                        assignment_id,
                        "We require a .png file as a result per the instructions. You submitted " + magic_type)
                else:
                    print(" Use --reject to reject " + assignment_id)
            else:
                next(accept_count)
                if options.accept:
                    print(" Accepting " + assignment_id)
                    mtc.approve_assignment(assignment_id)
                else:
                    print(" Use --accept to accept " + assignment_id)
            # NOTE(review): the rename is assumed to apply to every
            # downloaded file, accepted or not - confirm against the
            # original indentation.
            os.rename(output_filename, output_filename + add_extension)

    print("Total hits = %d; assignments = %d; accept = %d; reject = %d" % (
        next(hit_count), next(assignment_count),
        next(accept_count), next(reject_count)))
from boto.mturk.connection import MTurkConnection ACCESS_ID ="AKIAJ45QKE6AXRXSOD5A" SECRET_KEY = "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" HOST = 'mechanicalturk.sandbox.amazonaws.com' mtc = MTurkConnection(aws_access_key_id=ACCESS_ID, aws_secret_access_key=SECRET_KEY, host=HOST) hits = mtc.get_all_hits() per_pangram = 'Mr Jock TV quiz PhD bags few lynx' def answer_key(user_input): posbl_percent_1=0 posbl_percent_2=0 if len(per_pangram) >= len(user_input): diff = len(per_pangram)- len(user_input) #print diff #print len(per_pangram) #print len(user_input) for x in xrange(0, len(user_input)): #print "1user: %s,per_pan %s" % (user_input[x], per_pangram[x]) if user_input[x] == per_pangram[x]: posbl_percent_1 = posbl_percent_1 + 1 #print "2user: %s,per_pan %s" % (user_input[x], per_pangram[x+diff]) if user_input[x] == per_pangram[x+diff]:
class TranscriptionPipelineHandler():
    """Interactive driver for the audio-clip transcription pipeline on MTurk.

    Holds one MTurk connection plus the assignment, turker, HIT, Mongo,
    wav and prompt handlers, and exposes one method per pipeline step;
    ``run`` offers them through a text menu.
    """

    def __init__(self):
        """Connect to MTurk with credentials from the environment and build the helpers.

        Raises:
            KeyError: if ``AWS_ACCESS_KEY_ID`` or ``AWS_ACCESS_KEY`` is not set.
        """
        aws_id = os.environ['AWS_ACCESS_KEY_ID']
        aws_k = os.environ['AWS_ACCESS_KEY']

        self.conn = MTurkConnection(aws_access_key_id=aws_id,
                                    aws_secret_access_key=aws_k,
                                    host=HOST)

        self.ah = AssignmentHandler(self.conn)
        self.th = TurkerHandler(self.conn)
        self.hh = HitHandler(self.conn, TEMPLATE_DIR)
        self.mh = MongoTranscriptionHandler()
        self.wh = WavHandler()
        self.ph = PromptHandler()
        self.filter = Filter(self.mh)
        # Locally tracked balance; decremented as HITs are created.
        self.balance = self.conn.get_account_balance()[0].amount
        self.logger = logging.getLogger("transcription_engine.transcription_pipeline_handler")

    def audio_clip_referenced_to_hit(self, priority=1, max_queue_size=10):
        """Queue every "Referenced" audio clip, attempting a HIT after each one."""
        for audio_clip in self.mh.get_artifacts_by_state("audio_clips", "Referenced"):
            self.mh.queue_clip(audio_clip["_id"], priority, max_queue_size)
            self.audio_clip_queue_to_hit()

    def audio_clip_queued_to_hit(self, priority=1, max_queue_size=10):
        """Attempt to create a HIT once per clip currently in the "Queued" state.

        ``priority`` and ``max_queue_size`` are accepted but not used here.
        """
        for _audio_clip in self.mh.get_artifacts("audio_clips", {"state": "Queued"}):
            self.audio_clip_queue_to_hit()

    #===================================================================
    #     elif state == "Hit":
    #         print("In hit: %s"%audio_clip_url)
    #===================================================================

    def audio_clip_queue_to_hit(self, cost_sensitive=True):
        """Take queued audio clips from the audio clip queue,
        put them in a HIT and create the HIT.
        If successful, update the audio clip state.

        Nothing is created when ``cost_sensitive`` is false, when the queue
        yields no clip pairs, when any clip is already in a HIT (those
        clips are removed from the queue instead), or when the tracked
        balance minus the estimated cost would fall below 250.

        Returns:
            The result of ``update_audio_clips_state`` when a HIT was
            created, otherwise ``False``.
        """
        clip_queue = self.mh.get_audio_clip_queue()
        clip_pairs = self.mh.get_audio_clip_pairs(clip_queue)
        if not clip_pairs or not cost_sensitive:
            return False

        hit_title = "Audio Transcription"
        question_title = "List and Transcribe"
        description = "Transcribe the audio clip by typing the words the person says in order."
        keywords = "audio, transcription, audio transcription"
        reward_per_clip = 0.02
        max_assignments = 3

        estimated_cost = self.hh.estimate_html_HIT_cost(clip_pairs, reward_per_clip, max_assignments)
        clips_in_hits = self.mh.clips_already_in_hit(clip_pairs)
        if clips_in_hits:
            # If one or more clips are already in a HIT, remove it from the queue
            self.mh.remove_audio_clips_from_queue(clips_in_hits)
            return False
        if not (self.balance - estimated_cost >= 250):
            return False

        # If we have enough money, create the HIT
        response = self.hh.make_html_transcription_HIT(clip_pairs, hit_title,
                                                       question_title, description, keywords)
        if not (isinstance(response, ResultSet) and len(response) == 1 and response[0].IsValid):
            return False

        # Only charge the tracked balance once the HIT is known to be valid,
        # so a failed creation does not make the local balance drift.
        self.balance = self.balance - estimated_cost
        hit = response[0]
        self.mh.remove_audio_clips_from_queue(clip_queue)
        audio_clip_ids = [w["audio_clip_id"] for w in clip_queue]
        hit_id = hit.HITId
        hit_type_id = hit.HITTypeId
        self.mh.create_transcription_hit_artifact(hit_id, hit_type_id, clip_queue, "New")
        self.logger.info("Successfully created HIT: %s" % hit_id)
        return self.mh.update_audio_clips_state(audio_clip_ids, "Hit")

    def load_assignments_hit_to_submitted(self):
        """Check all assignments for audio clip IDs.
        Update the audio clips.
        This is a non-destructive load of the assignments from MTurk.

        Assignments already stored are skipped. For a new assignment every
        transcription and its clip is marked "Submitted"; the assignment
        artifact is only created when all of its clips are known.
        """
        for hit in self.conn.get_all_hits():
            hit_id = hit.HITId
            assignments = self.conn.get_assignments(hit_id)
            have_all_assignments = True
            assignment_ids = []
            for assignment in assignments:
                assignment_ids.append(assignment.AssignmentId)
                if self.mh.get_artifact("assignments", {"_id": assignment.AssignmentId}):
                    # We create assignments here, so if we already have it, skip
                    continue
                have_all_assignments = False
                transcription_ids = []
                transcription_dicts = self.ah.get_assignment_submitted_transcriptions(assignment)
                for transcription in transcription_dicts:
                    clip_id = transcription["audio_clip_id"]
                    if not self.mh.get_artifact_by_id("audio_clips", clip_id):
                        self.logger.info("Assignment(%s) with unknown audio clip(%s) skipped" %
                                         (assignment.AssignmentId, clip_id))
                        break
                    self.mh.update_transcription_state(transcription, "Submitted")
                    self.mh.update_audio_clips_state([clip_id], "Submitted")
                    transcription_ids.append(
                        self.mh.get_artifact("transcriptions",
                                             {"audio_clip_id": clip_id,
                                              "assignment_id": transcription["assignment_id"]},
                                             "_id"))
                else:
                    # for/else: reached only when no unknown clip broke the loop.
                    self.mh.create_assignment_artifact(assignment,
                                                       transcription_ids,
                                                       "Submitted")
            if assignments and not have_all_assignments:
                self.mh.update_transcription_hit_state(hit_id, "Submitted")
                print("Transcriptions HIT(%s) submitted assignments: %s " % (hit_id, assignment_ids))

    def assignment_submitted_approved(self):
        """For all submitted assignments,
        if an answered question has a reference transcription,
        check the WER.
        If all the answered questions with reference transcriptions
        have an acceptable WER, approve the assignment and update
        the audio clips and transcriptions.

        Assignments the filter does not approve are rejected on MTurk and
        marked "Denied".
        """
        assignments = self.mh.get_artifacts_by_state("assignments", "Submitted")
        rejected_feedback = "I'm sorry but your work in assignment(%s) was rejected because" +\
                            " one or more of your transcriptions " +\
                            " had a word error rate above the maximum acceptable" +\
                            " word error rate of %s. Omitted words and words that " +\
                            " differed by more than %s " +\
                            " characters were counted as an error."
        accepted_feedback = "Your average word error rate on assignment(%s) was %s." +\
                            " Assignment accepted! Thanks for your hard work."
        for assignment in assignments:
            assignment_id = assignment["_id"]
            transcription_ids = assignment["transcriptions"]
            transcriptions = self.mh.get_artifacts("transcriptions", "_id", transcription_ids)

            worker_id = self.mh.create_worker_artifact(assignment["worker_id"])

            approved, average_wer = self.filter.approve_assignment(transcriptions)
            if approved:
                try:
                    self.conn.approve_assignment(assignment_id,
                                                 accepted_feedback % (assignment_id, average_wer))
                except MTurkRequestError as e:
                    print(e)
                else:
                    self.mh.update_assignment_state(assignment, "Approved")
                    for transcription in transcriptions:
                        # Approve transcriptions without references in the same assignment
                        reference_id = self.mh.get_artifact_by_id(
                            "audio_clips", transcription["audio_clip_id"],
                            "reference_transcription_id")
                        if not reference_id:
                            self.mh.update_transcription_state(transcription, "Approved")
                    print("Approved transcription ids: %s" % transcription_ids)
            else:
                # NOTE(review): an older comment here said "Don't deny for
                # now", but the code does reject the assignment on MTurk.
                feedback = rejected_feedback % (assignment_id,
                                                self.filter.WER_THRESHOLD,
                                                self.filter.CER_THRESHOLD)
                self.logger.info(feedback)
                self.conn.reject_assignment(assignment_id, feedback)
                self.mh.update_assignment_state(assignment, "Denied")
            # Update the worker
            if approved:
                self.mh.add_assignment_to_worker(worker_id, (assignment_id, average_wer))

    def _load_rm_audio_source_file_to_clipped(self, file_dir, prompt_file_uri,
                                              base_clip_dir, sample_rate=16000,
                                              http_base_url="http://www.cis.upenn.edu/~tturpen/wavs/",
                                              init_clip_count=200):
        """For an audio directory,
        see which files are new and not an audio source already.

        Walks ``file_dir``, converts each non-wav file to a sibling ``.wav``
        when that does not exist yet, and for every resulting wav creates
        the audio source, audio clip and "Gold" reference transcription
        artifacts. Stops after ``init_clip_count`` wavs have been handled.

        Args:
            file_dir: root directory to walk; the path of a file's folder
                relative to it is used as the speaker id.
            prompt_file_uri: prompt file passed to the prompt handler.
            base_clip_dir: directory under which clips are copied.
            sample_rate: sample rate recorded on the audio source artifact.
            http_base_url: base of the URL recorded for each clip.
            init_clip_count: maximum number of wavs to process.

        Raises:
            PromptNotFound: when a wav has no entry in the prompt dictionary.
        """
        prompt_dict = self.ph.get_prompts(prompt_file_uri)
        count = 0
        for root, dirs, files in os.walk(file_dir):
            for f in files:
                if count == init_clip_count:
                    return
                system_uri = os.path.join(root, f)
                # str.strip(".sph") removes any of the characters ". s p h"
                # from both ends rather than the suffix, mangling names such
                # as "dhs.sph"; drop exactly the ".sph" suffix instead.
                if system_uri.endswith(".sph"):
                    wav_path = system_uri[:-len(".sph")] + ".wav"
                else:
                    wav_path = system_uri + ".wav"
                out_uri = os.path.join(root, os.path.basename(wav_path))
                spkr_id = str(os.path.relpath(root, file_dir))
                # sph to wav
                if not f.endswith(".wav") and not os.path.exists(out_uri):
                    try:
                        self.wh.sph_to_wav(system_uri, out_uri=out_uri)
                    except WavHandlerException as e:
                        self.logger.error("Unable to create wav from sph: " + str(e))

                if not (os.path.exists(out_uri) and out_uri.endswith(".wav")):
                    continue

                # create audio source artifact
                count += 1
                wav_filename = os.path.basename(out_uri)
                # Same character-set pitfall as above: take the stem properly.
                prompt_id = os.path.splitext(wav_filename)[0].upper()
                encoding = ".wav"
                disk_space = os.stat(out_uri).st_size
                length_seconds = self.wh.get_audio_length(out_uri)
                if prompt_id not in prompt_dict:
                    # No prompt found
                    raise PromptNotFound
                transcription_prompt = prompt_dict[prompt_id]

                source_id = self.mh.create_audio_source_artifact(out_uri,
                                                                 disk_space,
                                                                 length_seconds,
                                                                 sample_rate,
                                                                 spkr_id,
                                                                 encoding)
                # create audio clip artifact
                audio_clip_uri = os.path.join(base_clip_dir, spkr_id, wav_filename)
                clip_dir = os.path.dirname(audio_clip_uri)
                if not os.path.exists(clip_dir):
                    os.makedirs(clip_dir)
                if not os.path.exists(audio_clip_uri):
                    copyfile(out_uri, audio_clip_uri)

                # http_url
                http_url = os.path.join(http_base_url, spkr_id, wav_filename)
                clip_id = self.mh.create_audio_clip_artifact(source_id,
                                                             0,
                                                             -1,
                                                             audio_clip_uri,
                                                             http_url,
                                                             length_seconds,
                                                             disk_space)

                # Update the audio source, updates state too
                self.mh.update_audio_source_audio_clip(source_id, clip_id)

                # Create the reference transcription artifact
                transcription_id = self.mh.create_reference_transcription_artifact(
                    clip_id, transcription_prompt, "Gold")
                # Completes audio clip to Referenced
                self.mh.update_audio_clip_reference_transcription(clip_id, transcription_id)

    def _print_transcription_pairs(self, header, assignment_ids):
        """Print ``header`` and every reference/hypothesis pair of the given assignments."""
        print(header)
        for assignment_id in assignment_ids:
            for pair in self.mh.get_transcription_pairs(assignment_id):
                print("Reference:\n\t%s\nHypothesis:\n\t%s\n" % (pair[0], pair[1]))

    def all_workers_liveness(self):
        """For each worker, show assignment counts and print the transcriptions the operator picks.

        The answer is read with ``raw_input`` (like the other prompts in
        this class) so that operator input is never evaluated as code, and
        the options do what the menu text says: 1 = denied, 2 = accepted,
        3 = both.
        """
        workers = self.mh.get_all_workers()
        for worker in workers:
            approved, denied = self.mh.get_worker_assignments(worker)
            print("Worker(%s) assignments, approved(%s) denied(%s)" % (worker["_id"], approved, denied))
            selection = raw_input("1. Show denied transcriptions and references.\n" +
                                  "2. Show accepted transcriptions and references.\n" +
                                  "3. Show both denied and accepted transcriptions.").strip()
            if selection in ("1", "3"):
                self._print_transcription_pairs("Denied transcriptions", denied)
            if selection in ("2", "3"):
                self._print_transcription_pairs("Approved transcriptions", approved)

    def stats(self):
        """Print each worker's approved-assignment count and mean WER, then the overall mean.

        When no assignment has been approved the overall mean is undefined,
        so a notice is printed instead of dividing by zero.
        """
        workers = self.mh.get_all_workers()
        all_wer_per_approved_assignment = 0.0
        total_accepted = 0.0
        for worker in workers:
            approved, denied = self.mh.get_worker_assignments_wer(worker)
            # Each approved entry carries its WER at index 1.
            worker_wer = sum(float(w[1]) for w in approved)
            all_wer_per_approved_assignment += worker_wer
            total_accepted += len(approved)
            if approved:
                worker_average_wer = worker_wer / len(approved)
                print("%s,%s" % (len(approved), worker_average_wer))
            #print("Worker(%s) approved assignments(%s)\n denied assignments(%s)"%(worker_id,approved,denied))
        if not total_accepted:
            print("No approved assignments; average WER is undefined")
            return
        av = all_wer_per_approved_assignment / total_accepted
        print("Average WER per assignment(%s)" % (av))

    def get_assignment_stats(self):
        """Print the effective hourly wage assuming a reward of 0.20 per assignment."""
        self.effective_hourly_wage_for_approved_assignments(.20)

    def effective_hourly_wage_for_approved_assignments(self, reward_per_assignment):
        """Calculate the effective hourly wage for Approved Assignments.

        Averages (SubmitTime - AcceptTime) over approved assignments that
        carry a ``SubmitTime`` and prints the implied hourly wage.
        Assignments without a ``SubmitTime`` are skipped; if no usable
        timing remains, a notice is printed instead.
        """
        time_format = "%Y-%m-%dT%H:%M:%SZ"
        approved_assignments = self.mh.get_artifacts_by_state("assignments", "Approved")
        total = datetime.timedelta(0)
        count = 0
        for assignment in approved_assignments:
            if "SubmitTime" not in assignment:
                # Without this skip the previous iteration's times would be
                # counted again (or be unbound on the first record).
                continue
            accepted = datetime.datetime.strptime(assignment["AcceptTime"], time_format)
            submitted = datetime.datetime.strptime(assignment["SubmitTime"], time_format)
            total += submitted - accepted
            count += 1
        if count == 0 or total.total_seconds() <= 0:
            print("No usable approved assignment timings; cannot compute an hourly wage")
            return
        seconds_per_assignment = total.total_seconds() / count
        effective_hourly_wage = 60.0 * 60.0 / seconds_per_assignment * reward_per_assignment
        print("Effective completion time(%s) *reward(%s) = %s" %
              (seconds_per_assignment, reward_per_assignment, effective_hourly_wage))

    def allhits_liveness(self):
        """Walk all HITs and interactively offer to disable each one.

        Disabling a HIT with no assignments also removes its stored HIT
        artifact and returns its clips to "Referenced". Any
        ``MTurkRequestError`` propagates to the caller.
        """
        #allassignments = self.conn.get_assignments(hit_id)
        #first = self.ah.get_submitted_transcriptions(hit_id,str(clipid))
        for hit in self.conn.get_all_hits():
            hit_id = hit.HITId
            print("HIT ID: %s" % hit_id)
            assignments = self.conn.get_assignments(hit_id)
            num_assignments = len(assignments)
            if num_assignments == 0:
                if raw_input("Remove hit with no submitted assignments?(y/n)") == "y":
                    self.conn.disable_hit(hit_id)
                    clips = self.mh.get_artifact("transcription_hits", {"_id": hit_id}, "clips")
                    self.mh.remove_transcription_hit(hit_id)
                    self.mh.update_audio_clips_state(clips, "Referenced")
            elif raw_input("Remove hit with %s submitted assignments?(y/n)" % num_assignments) == "y":
                self.conn.disable_hit(hit_id)

    def run(self):
        """Show the pipeline menu in a loop and run the chosen step until "11" is entered."""
        audio_file_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/ind_trn"
        #audio_file_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/dep_trn"
        prompt_file_uri = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/doc/al_sents.snr"
        base_clip_dir = "/home/taylor/data/corpora/LDC/LDC93S3A/rm_comp/rm1_audio1/rm1/clips"
        selection = 0
        init_clip_count = 10000
        while selection != "11":
            selection = raw_input("""Audio Source file to Audio Clip Approved Pipeline:\n
                                     1: AudioSource-FileToClipped: Initialize Resource Management audio source files to %d queueable(Referenced) clips
                                     2: AudioClip-ReferencedToHit: Queue all referenced audio clips and create a HIT if the queue is full.
                                     3: AudioClip-HitToSubmitted: Check all submitted assignments for Transcriptions.
                                     4: AudioClip-SubmittedToApproved: Check all submitted clips against their reference.
                                     5: Review Current Hits
                                     6: Worker liveness
                                     7: Account balance
                                     8: Worker stats
                                     9: Recalculate worker WER
                                     10: Assignment Stats
                                     11: Exit
                                    """ % init_clip_count)
            #selection = "5"
            if selection == "1":
                self._load_rm_audio_source_file_to_clipped(audio_file_dir,
                                                           prompt_file_uri,
                                                           base_clip_dir,
                                                           init_clip_count=init_clip_count)
            elif selection == "2":
                self.audio_clip_referenced_to_hit()
            elif selection == "3":
                self.load_assignments_hit_to_submitted()
            elif selection == "4":
                self.assignment_submitted_approved()
            elif selection == "5":
                self.allhits_liveness()
            elif selection == "6":
                self.all_workers_liveness()
            elif selection == "7":
                print("Account balance: %s" % self.balance)
            elif selection == "8":
                self.stats()
            elif selection == "9":
                # The implementation below is commented out; look it up
                # dynamically so the menu does not die with AttributeError.
                recalculate = getattr(self, "recalculate_worker_assignment_wer", None)
                if recalculate is None:
                    print("Recalculate worker WER is currently disabled")
                else:
                    recalculate()
            elif selection == "10":
                self.get_assignment_stats()

#     def get_time_submitted_for_assignments(self):
#         assignments = self.mh.get_all_artifacts("assignments")
#         for assignment in assignments:
#             assignment_id = assignment["_id"]
#             a_assignment = self.conn.get_assignment(assignment_id)[0]
#             self.mh.update_artifact_by_id("assignments", assignment_id, "SubmitTime", a_assignment.SubmitTime)

#     def recalculate_worker_assignment_wer(self):
#         """For all submitted assignments,
#             if an answered question has a reference transcription,
#             check the WER.
#             If all the answered questions with reference transcriptions
#             have an acceptable WER, approve the assignment and update
#             the audio clips and transcriptions."""
#         assignments = self.mh.get_artifacts("assignments",{"state":"Approved"})
#         for assignment in assignments:
#             assignment_id = assignment["_id"]
#             denied = []
#             #If no transcriptions have references then we automatically approve the HIT
#             approved = True
#             transcription_ids = assignment["transcriptions"]
#             transcriptions = self.mh.get_transcriptions("_id",transcription_ids)
#             worker_id = assignment["worker_id"]
#             worker_id = self.mh.create_worker_artifact(worker_id)
#
#             max_rej_wer = (0.0,0.0)
#             total_wer = 0.0
#             for transcription in transcriptions:
#                 #Normalize the transcription
#                 #self.mh.normalize_transcription
#                 reference_id = self.mh.get_audio_clip_by_id(transcription["audio_clip_id"],"reference_transcription_id")
#                 if reference_id:
#                     reference_transcription = self.mh.get_reference_transcription({"_id": reference_id},
#                                                                                   "transcription")
#                     new_transcription = transcription["transcription"].split(" ")
#                     if reference_transcription:
#                         transcription_wer = cer_wer(reference_transcription,new_transcription)
#                         total_wer += transcription_wer
#                         if transcription_wer < WER_THRESHOLD:
#                             self.logger.info("WER for transcription(%s) %d"%(transcription["transcription"],transcription_wer))
#                         else:
#                             max_rej_wer = (transcription_wer,WER_THRESHOLD)
#                             denied.append((reference_transcription,new_transcription))
#                             approved = False
#             average_wer = total_wer/len(transcriptions)
#             #Update the worker
#             self.mh.add_assignment_to_worker(worker_id,(assignment_id,average_wer))
print url, title, text tuple_list.append([url, title, ' '.join(text)]) print len(tuple_list) print 'Retrieved', len(tuple_list) print 'Missing: ', len(missing) print missing print 'Sending it to mechanicalturk' mtc = MTurkConnection(aws_access_key_id=ACCESS_ID, aws_secret_access_key=SECRET_KEY, host=HOST, is_secure=True, https_connection_factory=(https_connection_factory, ())) print mtc.get_all_hits() fact = MTurkSurveyFactory() questionForms = fact.buildSurvey(tuple_list) print len(questionForms) missing_forms = [] for questionForm in questionForms: try: fact.submitHITs(mtc=mtc, questionForms=[questionForm]) except: missing_forms.extend(questionForm) print "Unexpected error:", sys.exc_info()[0] print len(missing_forms), ' forms could not be submitted' print missing_forms