def create_training_urls(self): num_youtube_urls = int(self.youtube_proportion / 100 * self.num_urls) num_dailymotion_urls = int(self.dailymotion_proportion / 100 * self.num_urls) num_5min_urls = self.num_urls - num_youtube_urls - num_dailymotion_urls urls = set() urls.update( VideoUrls.get_youtube_urls(self.search_kw, num_youtube_urls)) urls.update(VideoUrls.get_5min_urls(self.search_kw, num_5min_urls)) urls.update( VideoUrls.get_dailymotion_urls(self.search_kw, num_dailymotion_urls)) dynamo = DynamoIngestionStatusClient() download_queue = sqs.get_queue(config.sqs_download_queue_name()) for url in map(parse_url, urls): if dynamo.get(url) is None: item_to_enqueue = { "url": url, "status": "Queued", "download_stage": "Text" } dynamo.put(item_to_enqueue) sqs.write_to_queue(download_queue, item_to_enqueue) VideoTrainingURL(job=self, url=url) self.num_urls = len(urls) session.flush() print 'Ingested %s urls to the download queue' % (str(len(urls)))
def submit_hits(cls, days_since_today=30):
    """submit hits to Mturk for all images that we have verdict for"""
    # Only consider classifier results newer than the lookback window.
    min_date = datetime.utcnow() - timedelta(days=days_since_today)
    num_hits_submitted = 0
    for clf_target, images in cls.results_to_qa(min_date).iteritems():
        label_id = clf_target.target_label_id
        evaluator = ClickableImageEvaluator.query.filter_by(
            target_label_id=label_id).one()
        images_to_submit = []
        for vid, ts in images:
            mtb = MTurkImage.query.filter_by(
                video_id=vid, timestamp=ts, label_id=label_id).first()
            if not mtb:  # post HIT if no MTurkImage
                images_to_submit.append([vid, ts])
        # Batch images NUM_IMAGES_PER_HIT at a time, one HIT per batch.
        for i in xrange(0, len(images_to_submit), cls.NUM_IMAGES_PER_HIT):
            images_per_hit = images_to_submit[i:i + cls.NUM_IMAGES_PER_HIT]
            hit_id = evaluator.create_hit(image_ids=images_per_hit)
            ih = ImageHit(hit_id=hit_id)
            num_hits_submitted += 1
            # Flush so ih.id is populated before the MTurkImage rows
            # below reference it.
            session.flush()
            for vid, ts in images_per_hit:
                mti = MTurkImage(
                    video_id=vid, timestamp=ts, image_hit_id=ih.id,
                    label_id=label_id)
                cls(clf_target_id=clf_target.id, mturk_image=mti)
    session.flush()
    return ImageHit, num_hits_submitted
def get_or_create(cls, target_label):
    """Return the evaluator for target_label, creating one if absent."""
    found = cls.query.filter_by(target_label=target_label).first()
    if found is not None:
        return found
    created = cls(name=target_label.name, target_label=target_label)
    session.flush()
    return created
def get_or_create(cls, remote_id):
    """Return the page for remote_id, creating it if it does not exist."""
    remote_id = parse_url(remote_id)
    # Pages are keyed by the SHA-1 of the normalized remote id.
    digest = sha1(remote_id).hexdigest()
    page = cls.query.filter_by(remote_id_sha1=digest).first()
    if page is None:
        page = cls(remote_id=remote_id, remote_id_sha1=digest)
    session.flush()
    return page
def submit_hits(cls, days=30):
    """Submit video detector results to MTurk for QA"""
    timeframe = timedelta(days)
    # NOTE(review): uses naive local time (datetime.now) while the image
    # submitter uses utcnow -- confirm which is intended.
    min_date = datetime.now() - timeframe
    evaluators = {}  # cache: clf_target.id -> VideoCollageEvaluator
    num_hits_submitted = 0
    for video_id, clf_target in cls.results_to_qa(min_date):
        if clf_target.id not in evaluators:
            evaluators[clf_target.id] = VideoCollageEvaluator.query.\
                filter_by(target_label_id=clf_target.target_label_id).one()
        evaluator = evaluators[clf_target.id]
        label_id = evaluator.target_label_id
        vh = VideoHit.query.filter_by(video_id=video_id,
                                      label_id=label_id).first()
        if not vh:
            # No HIT yet for this (video, label): create one on MTurk.
            hit_id = evaluator.create_hit(video_id=video_id)
            vh = VideoHit(
                hit_id=hit_id, label_id=label_id, video_id=video_id)
            num_hits_submitted += 1
            session.flush()
        # Register the QA expectation once per (video, clf_target).
        if not cls.query.filter_by(video_id=video_id,
                                   clf_target_id=clf_target.id).count():
            cls(video_id=video_id, clf_target_id=clf_target.id,
                hit_id=vh.hit_id, expected_result=True)
    session.flush()
    return VideoHit, num_hits_submitted
def update_status(cls):
    """Ingest new data from MTurk and write it to the database."""
    for job in cls.query.filter(cls.finished == False):
        num_hits_left = session.query(BoxHit).filter_by(
            training_job_id=job.id, outstanding=True).count()
        urls_left = session.query(VideoTrainingURL).filter_by(
            training_job_id=job.id, processed=False)
        dynamo = DynamoIngestionStatusClient()
        num_urls_left = 0
        for url in urls_left:
            dynamo_url = dynamo.get(url.url)
            if dynamo_url is None or dynamo_url['status'] == 'Failed':
                # will never be processed, so ignore for our purposes
                url.processed = True
            else:
                num_urls_left += 1
        # A job is done once it has no outstanding HITs and no pending URLs.
        if num_hits_left + num_urls_left == 0:
            job.finished = True
            print '*** Job ID: %s is complete ***' % str(job.id)
        print '------------- Stats for Job ID: %s -------------' % str(
            job.id)
        print 'Total URLs : %i' % VideoTrainingURL.query.filter_by(
            training_job_id=job.id).count()
        print 'Total HITs : %i' % BoxHit.query.filter_by(
            training_job_id=job.id).count()
        if not job.finished:
            print 'unprocessed URLs: %i' % num_urls_left
            print 'outstanding HITs: %i\n' % num_hits_left
    session.flush()
def submit_golden_hits(n_hits, n_lookback):
    """Submit golden hits.

    Fetches the N_LOOKBACK hits most recently selected for golden
    submission and submits N_HITS of them, cycling through them as
    necessary, and prioritizing those that have been submitted as golden
    the least number of times.

    Args:
        n_hits: Number of golden hits submissions.
        n_lookback: Number of distinct hits used for submission.

    Raises:
        AssertionError: No candidate golden hits
    """
    query = session.query(GoldenHitCandidate.hit_id)
    assert query.count() > 0, "No candidate golden hits"
    # Restrict to the n_lookback most recent candidates...
    query = query.order_by(
        GoldenHitCandidate.created_at.desc()).limit(n_lookback)
    # ...wrapped as a subquery so the grouping/ordering below applies to
    # that window, not the whole table.
    query = query.from_self()
    query = query.outerjoin(GoldenHit,
                            GoldenHitCandidate.hit_id == GoldenHit.hit_id)
    query = query.group_by(GoldenHitCandidate.hit_id)
    # Fewest prior golden submissions first (outerjoin counts zero for
    # never-submitted candidates).
    query = query.order_by(func.count(
        GoldenHit.hit_id).asc()).limit(n_hits)
    # Cycle so n_hits submissions are produced even when fewer distinct
    # hits are available.
    hit_ids = islice(cycle([hit_id for (hit_id, ) in query]), n_hits)
    for hit in map(get_hit_from_hit_id, hit_ids):
        ghid = MechanicalTurkEvaluator.create_duplicate_hit(hit)
        GoldenHit(golden_hit_id=ghid, hit_id=hit.hit_id)
        session.flush()
def enable_qa(cls, label, collage_count, **kw):
    """Turn on QA for a label and configure its collage evaluator."""
    evaluator = VideoCollageEvaluator.get_or_create(label)
    label.qa_enabled = True
    label.collage_count = collage_count
    for attr_name, attr_value in kw.iteritems():
        setattr(evaluator, attr_name, attr_value)
    session.flush()
def enable_qa(cls, clf_target, qa_count, **kw):
    """Enable QA on a classifier target and configure its evaluator.

    Extra keyword arguments are copied onto the evaluator as attributes.
    """
    assert isinstance(clf_target, ClassifierTarget), \
        "Can only enable ClassifierTargets, got %s" % clf_target
    evaluator = cls.EVALUATOR_CLS.get_or_create(clf_target.target_label)
    for attr_name, attr_value in kw.iteritems():
        setattr(evaluator, attr_name, attr_value)
    # Flip the QA flag and count fields named by the concrete subclass.
    setattr(clf_target, cls.QA_TYPE, True)
    setattr(clf_target, cls.QA_COUNT_TYPE, qa_count)
    session.flush()
def enable_qa(cls, label, screenshot_count, non_preroll_qa_count, **kw):
    """Turn on page-level QA for a label and configure its text evaluator."""
    evaluator = WebPageTextEvaluator.get_or_create(label)
    label.page_qa_enabled = True
    label.screenshot_count = screenshot_count
    label.non_preroll_qa_count = non_preroll_qa_count
    for attr_name, attr_value in kw.iteritems():
        setattr(evaluator, attr_name, attr_value)
    session.flush()
def update_mturk_results(cls, mt_results):
    """Apply ingested MTurk image verdicts to their MTurkImage rows.

    Args:
        mt_results: iterable of (hit_id, video_id, timestamp, label_id,
            result) tuples as ingested from MTurk.
    """
    for hit_id, video_id, timestamp, label_id, result in mt_results:
        mi = cls.query.filter_by(video_id=video_id,
                                 timestamp=timestamp,
                                 label_id=label_id).first()
        if mi is None:
            # Fixed missing ':' after 'timestamp' in the message; use
            # the logger's lazy %-args instead of eager interpolation.
            logger.warn(
                "MTurkImage not found for video_id:%s, timestamp:%s, "
                "ImageHit.hit_id:%s", video_id, timestamp, hit_id)
        else:
            mi.result = result
            # Mark the parent HIT as no longer outstanding.
            mi.hit.outstanding = False
    session.flush()
def update_mturk_results(cls, mt_results):
    """Record ingested MTurk box verdicts and close out their HITs."""
    for hit_id, box_id, label_id, result in mt_results:
        box_row = cls.query.filter_by(box_id=box_id,
                                      label_id=label_id).first()
        if box_row is None:
            logger.warn('MTurkBox not found for box_id:%s, label_id:%s and BoxHit.hit_id:%s' % (box_id, label_id, hit_id))
        else:
            box_row.result = result
            # Bulk-update avoids loading the BoxHit row into the session.
            BoxHit.query.filter_by(hit_id=hit_id).update(
                {"outstanding": False}, synchronize_session=False)
    session.flush()
def update_mturk_results(cls, mt_results):
    """ take ingested results from MTurk and update results on the DB """
    # Fix: the docstring above was previously placed inside the for-loop
    # body, where it was a no-op string statement evaluated on every
    # iteration; moved to the conventional function-level position
    # (matching the PageHit variant of this method).
    for hit_id, video_id, label_id, result in mt_results:
        vh = VideoHit.query.filter_by(hit_id=hit_id).first()
        if vh is None:
            logger.warn("Hit not found %s", hit_id)
        else:
            vh.result = result
            vh.outstanding = False
    session.flush()
def update_mturk_results(cls, mt_results):
    """ take ingested results from MTurk and update results on the DB """
    for hit_id, page_id, label_id, result in mt_results:
        hit_row = PageHit.query.filter_by(hit_id=hit_id).first()
        if hit_row is not None:
            hit_row.result = result
            hit_row.outstanding = False
        else:
            logger.warn("Hit not found %s", hit_id)
    session.flush()
def create_hit(self, **kwargs):
    """Submit a task to MTurk"""
    # Reward is stored in cents; the MTurk API expects dollars.
    reward_amt = self.reward_amt / 100.0
    try:
        try:
            template_data = self.format_data(**kwargs)
            hit_html = self.generate_html(**template_data)
            hit_id = MTurkUtils.submit_hit(
                hit_html, self.title, self.description, self.keywords,
                self.approval_delay, reward_amt, self.duration,
                self.lifetime, self.max_assignments,
                require_adult=self.require_adult,
                min_percent_approved=self.min_percent_approved,
                min_hits_approved=self.min_hits_approved,
                require_us=True)
        except (MTurkRequestError, UnicodeEncodeError):
            # page_text evaluators get one retry with the title
            # pre-processed (presumably sanitizing characters MTurk
            # rejects -- confirm in format_data); anything else re-raises.
            if self.evaluator_type == 'page_text':
                template_data = self.format_data(process_title=True,
                                                 **kwargs)
                hit_html = self.generate_html(**template_data)
                hit_id = MTurkUtils.submit_hit(
                    hit_html, self.title, self.description, self.keywords,
                    self.approval_delay, reward_amt, self.duration,
                    self.lifetime, self.max_assignments,
                    require_adult=self.require_adult,
                    min_percent_approved=self.min_percent_approved,
                    min_hits_approved=self.min_hits_approved,
                    require_us=True)
            else:
                raise
    except Exception:
        logger.info('HIT creation failed for %s' % kwargs)
        tb = traceback.format_exc() + '\n input kwargs: %s' % kwargs
        # Persist the failure unless it was just an out-of-funds error.
        if 'AWS.MechanicalTurk.InsufficientFunds' not in tb:
            MTurkHitFailure(hit_id='Invalid HIT', message=tb)
            session.flush()
        raise
    logger.info('created %s' % hit_id)
    return hit_id
def update_on_demand_job_status(self, mt_results):
    """Record on-demand job results ingested from MTurk."""
    from affine.model.mturk import MTurkOnDemandJob
    for hit_id, job_id, resource_id, label_id, result in mt_results:
        job_row = MTurkOnDemandJob.query.filter_by(resource_id=resource_id,
                                                   job_id=job_id,
                                                   hit_id=hit_id).first()
        if job_row is None:
            msg = "MTurkOnDemandJob not found for hit_id:%s, job_id:%s, thumbnail:%s"
            logger.warn(msg % (hit_id, job_id, resource_id))
        else:
            job_row.result = result
            job_row.outstanding = False
    session.flush()
def set_values(self, **kwargs):
    # NOTE(review): this reads like the inner wrapper of a decorator --
    # it calls a closed-over `func` -- confirm against the enclosing
    # definition, which is outside this view.
    bucket = config.get('affine.s3.bucket')
    try:
        if self.mock_evaluator:
            # Mock evaluators are made trivially easy to satisfy so that
            # tests do not depend on real MTurk worker-quality gates.
            self.min_percent_approved = 0
            self.max_assignments = 1
            self.min_hits_approved = 0
            self.match_threshold = 1
            self.require_adult = False
            session.flush()
            config.set('affine.s3.bucket', 'affine')
        f = func(self, **kwargs)
        return f
    finally:
        # Always restore the original bucket setting, even on error.
        config.set('affine.s3.bucket', bucket)
def submit_hits(cls):
    """Submit video labels to MTurk for QA"""
    evaluators = {}  # cache: label_id -> WebPageTextEvaluator
    num_hits_submitted = 0
    for label_id, wp_id, expected_result in cls.results_to_qa():
        if label_id not in evaluators:
            evaluators[label_id] = WebPageTextEvaluator.query.filter_by(
                target_label_id=label_id).one()
        new_hit_id = evaluators[label_id].create_hit(page_id=wp_id)
        # Instantiating the PageHit registers it with the session.
        PageHit(hit_id=new_hit_id, label_id=label_id, page_id=wp_id)
        num_hits_submitted += 1
    session.flush()
    return PageHit, num_hits_submitted
def register_prev_qa(cls):
    """Back-fill QA records for pages that already have PageHits but no
    entry for the current classifier version."""
    tdr, ph, wp = TextDetectorResult, PageHit, WebPage
    base_query = session.query(
        tdr.page_id, tdr.clf_target_id, ph.hit_id).\
        filter(ph.page_id == tdr.page_id)
    base_query = base_query.join(wp, tdr.page_id == wp.id)
    for clf_target in cls.enabled_clf_targets():
        # Pages re-detected after the classifier's last update, with an
        # existing hit for this target's label.
        query = base_query.\
            filter(tdr.clf_target_id == clf_target.id,
                   wp.text_detection_update > clf_target.clf.updated_at,
                   ph.label_id == clf_target.target_label_id)
        # Left-join against our own table so rows with cls.page_id NULL
        # are exactly the not-yet-registered pages.
        query = query.\
            outerjoin(cls, and_(cls.clf_target_id == clf_target.id,
                                cls.detector_version == clf_target.clf.updated_at,
                                cls.page_id == tdr.page_id))
        for i in query.filter(cls.page_id == None):
            cls(page_id=i.page_id,
                detector_version=clf_target.clf.updated_at,
                clf_target_id=clf_target.id, hit_id=i.hit_id)
    session.flush()
def create_with_evaluator_and_training_urls(cls, label_id, num_urls,
                                            search_kw, **evaluator_kwargs):
    """Create a TrainingJob (and its ClickableBoxEvaluator if one does
    not exist yet) and kick off training-URL ingestion.

    Args:
        label_id: id of the Label being trained.
        num_urls: number of training URLs to ingest.
        search_kw: search keyword used to find candidate videos.
        **evaluator_kwargs: extra fields for a newly created evaluator
            (ignored when the evaluator already exists).

    Returns:
        The newly created TrainingJob.
    """
    # Fixed local-variable typo: 'evluator' -> 'evaluator'.
    evaluator = ClickableBoxEvaluator.query.filter_by(
        target_label_id=label_id).first()
    if not evaluator:
        evaluator_name = "Training Evaluator for %s" % Label.get(
            label_id).name
        evaluator = ClickableBoxEvaluator(name=evaluator_name,
                                          target_label_id=label_id,
                                          **evaluator_kwargs)
    training_job = TrainingJob(
        label_id=label_id,
        evaluator=evaluator,
        num_urls=num_urls,
        search_kw=search_kw,
    )
    # Flush so the job has an id before URL ingestion references it.
    session.flush()
    training_job.create_training_urls()
    return training_job
def submit_hits(cls, days_since_today=30):
    """submit hits to Mturk for all boxes that we have verdict for"""
    min_date = datetime.now() - timedelta(days=days_since_today)
    num_hits_submitted = 0
    results = cls.results_to_qa(min_date)
    for clf_target, box_ids in results.iteritems():
        label_id = clf_target.target_label_id
        evaluator = ClickableBoxEvaluator.query.filter_by(
            target_label_id=label_id).one()
        box_ids_to_submit = []
        for box_id in box_ids:
            mtb = MTurkBox.query.filter_by(label_id=label_id,
                                           box_id=box_id).first()
            if mtb:
                # Box already has an MTurkBox: just link it to this
                # classifier target, no new HIT needed.
                cls(box_id=box_id, clf_target_id=clf_target.id,
                    mturk_box=mtb)
            else:
                box_ids_to_submit.append(box_id)
        for i in xrange(0, len(box_ids_to_submit), cls.NUM_BOXES_PER_HIT):
            # slicing all boxes so that we can put "NUM_BOXES_PER_HIT"
            # on each BoxHit
            boxes_per_hit = box_ids_to_submit[i:i + cls.NUM_BOXES_PER_HIT]
            hit_id = evaluator.create_hit(box_ids=boxes_per_hit)
            session.flush()
            b = BoxHit(hit_id=hit_id)
            num_hits_submitted += 1
            # Flush so b.id is populated before the MTurkBox rows below
            # reference it.
            session.flush()
            for box_id in boxes_per_hit:
                mtb = MTurkBox(box_id=box_id, box_hit_id=b.id,
                               label_id=label_id)
                cls(box_id=box_id, clf_target_id=clf_target.id,
                    mturk_box=mtb)
    session.flush()
    return BoxHit, num_hits_submitted
def submit_hits(cls, days=30):
    """Submit web pages to MTurk for QA"""
    min_date = datetime.now() - timedelta(days)
    evaluators = {}  # cache: clf_target.id -> WebPageTextEvaluator
    # First back-fill QA entries for pages that already have hits.
    cls.register_prev_qa()
    num_hits_submitted = 0
    for clf_target, page_id, clf_updated_at in cls.results_to_qa(min_date):
        if clf_target.id not in evaluators:
            label_id = clf_target.target_label_id
            evaluators[clf_target.id] = WebPageTextEvaluator.query.filter_by(
                target_label_id=label_id).one()
        evaluator = evaluators[clf_target.id]
        label_id = evaluator.target_label_id
        ph = PageHit.query.filter_by(
            page_id=page_id, label_id=label_id).first()
        if not ph:
            # No existing hit for this page/label: create one on MTurk.
            hit_id = evaluator.create_hit(page_id=page_id)
            ph = PageHit(hit_id=hit_id, label_id=label_id, page_id=page_id)
            num_hits_submitted += 1
        # Record the QA entry pinned to the current classifier version.
        cls(page_id=page_id, detector_version=clf_updated_at,
            clf_target_id=clf_target.id, hit_id=ph.hit_id)
    session.flush()
    return PageHit, num_hits_submitted
def submit_hits(self): """Submit facebox hits from the training video url table to MTurk for QA""" # query all processed urls (TrainingVideoURL table) for boxes # create BoxHits for all boxes and submit hits to Mturk boxes = [] num_hits_submitted = 0 for url in session.query(VideoTrainingURL).filter_by( training_job_id=self.id, processed=False): wpage = WebPage.by_url(url.url) if wpage is not None: # get the video, and set url.processed only if video is updated on its face version videos = sorted(wpage.active_videos, key=lambda x: x.length, reverse=True) if len(videos) != 0: video = videos[0] images_in_s3 = video.s3_timestamps() for b in video.face_boxes: if b.timestamp in images_in_s3: boxes.append(b.id) url.processed = True else: url.processed = True boxes = sorted(set(boxes)) for i in xrange(0, len(boxes), self.NUM_BOXES_PER_HIT): boxes_per_hit = boxes[i:i + self.NUM_BOXES_PER_HIT] hit_id = self.evaluator.create_hit(box_ids=boxes_per_hit) session.flush() b = BoxHit(hit_id=hit_id, training_job_id=self.id) num_hits_submitted += 1 for box_id in boxes_per_hit: if not MTurkBox.query.filter_by( box_id=box_id, label_id=self.label_id).count(): MTurkBox(box_id=box_id, hit=b, label_id=self.label_id) session.flush() return BoxHit, num_hits_submitted
def get_or_create(cls, worker_id):
    """Fetch the MTurkWorker row for worker_id, creating it if missing."""
    worker = MTurkWorker.query.filter_by(worker_id=worker_id).scalar()
    if worker is None:
        worker = MTurkWorker(worker_id=worker_id)
        session.flush()
    return worker
def get_or_create(cls, domain, **query_args):
    """Look up (or create) the row matching domain plus query_args."""
    # Domains are stored lower-cased; fold it into the lookup criteria.
    query_args['domain'] = domain.lower()
    entry = cls.query.filter_by(**query_args).first()
    if entry is None:
        entry = cls(**query_args)
    session.flush()
    return entry
def block(self, reason=BLOCK_REASON):
    """Block this worker on MTurk and record when the block started."""
    MTurkUtils.block_worker(self.worker_id, reason)
    blocked_at = datetime.utcnow()
    self.blocked_since = blocked_at
    session.flush()
def unblock(self, reason=""):
    """Lift the MTurk block on this worker and clear the block timestamp."""
    MTurkUtils.unblock_worker(self.worker_id, reason)
    self.blocked_since = None
    session.flush()