def handle(self, **options):
    pid = Pid('mturk_diffs', True)

    transaction.enter_transaction_management()
    transaction.managed(True)

    start_time = time.time()

    try:
        for c in Crawl.objects.filter(
                is_spam_computed=False).order_by('-id')[:options['limit']]:
            updated = update_cid(c.id)
            if updated > 0:
                c.has_diffs = True
                c.save()
            transaction.commit()
    except (KeyError, KeyboardInterrupt):
        transaction.rollback()
        pid.remove_pid()
        exit()

    logger.info('updating %s crawls took: %s s',
                options['limit'], time.time() - start_time)
    pid.remove_pid()
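# Pid('<name>', True) acts as a simple run lock shared by all of these
# management commands; its implementation is not part of this module. A
# minimal sketch of what such a pidfile helper might look like -- the file
# location and the meaning of the boolean flag are assumptions:
import os
import sys
import tempfile

class Pid(object):
    """Assumed pidfile lock: refuse to start when another instance is running."""

    def __init__(self, name, exit_if_running=True):
        self.path = os.path.join(tempfile.gettempdir(), '%s.pid' % name)
        if os.path.exists(self.path) and exit_if_running:
            sys.exit('%s already running (pidfile %s exists)' % (name, self.path))
        with open(self.path, 'w') as f:
            f.write(str(os.getpid()))

    def remove_pid(self):
        # Drop the pidfile so the next run can proceed.
        if os.path.exists(self.path):
            os.remove(self.path)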
def handle(self, **options):
    pid = Pid('mturk_agregates', True)

    start_time = time.time()
    logging.info('Updating crawl aggregates')
    update_crawl_agregates(1, only_new=True)
    logging.info('db_update_agregates took: %s' % (time.time() - start_time))

    pid.remove_pid()
def handle(self, **options):
    pid = Pid('mturk_crawler', True)

    start_time = time.time()

    logging.info('cleaning up db from duplicates')
    clean_duplicates()

    logging.info('Refreshing hits_mv')
    update_mviews()
    logging.info('done refreshing hits_mv')

    logging.info('db_refresh_mviews took: %s' % (time.time() - start_time))

    pid.remove_pid()
def handle(self, **options):
    pid = Pid('mturk_agregates', True)

    key = 'TOPREQUESTERS_CACHED'
    result = cache.get(key)
    if result is not None:
        logging.info("toprequesters still in cache...")
        # Release the pid lock before bailing out early.
        pid.remove_pid()
        return

    days = options['days']
    logging.info("toprequesters missing, refetching")

    # No cache entry -- perform the query and store the result.
    from mturk.main.views import topreq_data
    start_time = time.time()
    data = topreq_data(days)
    logging.info("toprequesters: filled memcache in %s",
                 time.time() - start_time)
    cache.set(key, data, HOURS4)

    pid.remove_pid()
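# The view side presumably reads the same cache key and falls back to
# recomputing when the warm-up command above has not run yet. A minimal
# sketch under that assumption -- the key name, HOURS4 timeout and
# topreq_data import come from the command above; the function name
# toprequesters_cached and the default `days` value are made up:
from django.core.cache import cache

HOURS4 = 60 * 60 * 4  # assumed to match the timeout used by the command

def toprequesters_cached(days=30):
    """Return toprequesters data, preferring the pre-warmed cache."""
    key = 'TOPREQUESTERS_CACHED'
    data = cache.get(key)
    if data is None:
        # Cache miss: recompute inline and repopulate the cache.
        from mturk.main.views import topreq_data
        data = topreq_data(days)
        cache.set(key, data, HOURS4)
    return data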
def handle(self, **options): """ Take ${lmit} last crawls without spam classification Classify all hit groups, update hits_mv to have proper hit classification Rebuild crawl_aggregates for a given crawl Refresh memcache """ service = get_prediction_service() pid = Pid('classify_spam', True) transaction.enter_transaction_management() transaction.managed(True) start_time = time.time() try: number_of_predictions = 0 for c in list(Crawl.objects.filter(is_spam_computed=False).order_by('-id')[:options['limit']]): log.info("processing %s", c) spam = set([]) not_spam = set([]) updated = 0 for row in query_to_dicts("""select content_id, group_id, is_spam from hits_mv where crawl_id = %s""", c.id): log.info("classyfing crawl_id: %s, %s", c.id,row) if row['is_spam'] is None: is_spam = None content = HitGroupContent.objects.get(id= row['content_id']) if content.is_spam is None: data = content.prepare_for_prediction() body = {'input': {'csvInstance': data}} prediction = service.predict(body=body, data=options['file']).execute() number_of_predictions += 1 updated += 1 content.is_spam = prediction['outputLabel'] != 'No' content.save() execute_sql("update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id'])) transaction.commit() if content.is_spam: log.info("detected spam for %s", row) spam.add(str(row['content_id'])) else: not_spam.add(str(row['content_id'])) else: log.info("is_spam already computed for %s" % row) if updated > 0: c.is_spam_computed=True c.save() log.info("done classyfing crawl") execute_sql("""UPDATE main_crawlagregates set spam_projects = ( select count(*) from hits_mv where crawl_id = %s and is_spam = true ) where crawl_id = %s""" % (c.id, c.id) ) transaction.commit() log.info("dome processing %s", c) except (KeyError, KeyboardInterrupt, HttpError), e: log.error(e) transaction.rollback() pid.remove_pid() exit()
def run(self):
    pid = Pid('mturk_crawler', True)

    logging.info('Crawler started')

    start_time = datetime.datetime.now()

    # Fetch statistical information about the number of groups and HITs.
    logging.debug("Fetching stats")
    main_response = urllib2.urlopen(get_allhit_url())
    main_html = main_response.read()
    main_soup = BeautifulSoup(
        main_html,
        parseOnlyThese=SoupStrainer(
            text=re.compile("(^[0-9,]+ HITs|of [0-9]+ Results)")))
    main_stats = [tag for tag in main_soup]
    hits_available = -1
    groups_available = -1
    if len(main_stats) > 1:
        hits_available_tmp = main_stats[0]
        hits_available_tmp = hits_available_tmp[
            :hits_available_tmp.find(' ')].replace(',', '')
        hits_available = int(hits_available_tmp)
        groups_available_tmp = main_stats[1]
        groups_available_tmp = groups_available_tmp[
            groups_available_tmp.find('of') + 3:
            groups_available_tmp.find('Results') - 1]
        groups_available = int(groups_available_tmp)
    main_soup = None

    # Fetch data from every mturk.com HITs list page.
    logging.debug("Allhit processing")
    result_allhit = self.process_values(
        range(1, self.get_max_page(main_html) + 1),
        callback_allhit, self.processes_count)
    self.data = result_allhit['data']
    self.append_errors(result_allhit['errors'])

    # Fetch HTML details for every HIT group.
    logging.debug("Details processing")
    result_details = self.process_values(self.data, callback_details,
                                         self.processes_count)
    self.data = result_details['data']
    self.append_errors(result_details['errors'])

    hits_downloaded = sum(
        [hgs['HitGroupStatus']['hits_available'] for hgs in self.data])
    groups_downloaded = len(self.data)

    # The crawl counts as successful when something was downloaded and the
    # available/downloaded ratios stay under the threshold (note that these
    # are integer divisions under Python 2).
    success = False
    if (groups_downloaded > 0 and hits_downloaded > 0
            and groups_available / groups_downloaded <= 1.5
            and hits_available / hits_downloaded <= 1.5):
        success = True

    # Log crawl information into the database.
    logging.debug("Crawl finished with success=%s. Saving main_crawl entry"
                  % success)
    crawl = Crawl(**{
        'start_time': start_time,
        'end_time': datetime.datetime.now(),
        'success': success,
        'hits_available': hits_available,
        'hits_downloaded': hits_downloaded,
        'groups_available': groups_available,
        'groups_downloaded': groups_downloaded,
        #'errors': str(self.errors)  # !
        'errors': ''
    })
    crawl.save()

    # Add the crawl FK to every downloaded record.
    logging.debug("Adding FKs")
    result_add_crawlfk = self.process_values(self.data, callback_add_crawlfk,
                                             crawl=crawl)
    self.data = result_add_crawlfk['data']
    self.append_errors(result_add_crawlfk['errors'])

    # Save the results in the database.
    logging.debug("Saving results")
    result_save_database = self.process_values(self.data, callback_database)
    self.append_errors(result_save_database['errors'])

    print self.errors

    logging.info(
        "Crawler finished %ssuccessfully in %s with %d results, "
        "%d HITs (of %d and %d) and %d errors" % (
            "" if success else "un",
            datetime.datetime.now() - start_time,
            groups_downloaded, hits_downloaded,
            groups_available, hits_available,
            len(self.errors)))

    pid.remove_pid()
def handle(self, **options): """ Take ${lmit} last crawls without spam classification Classify all hit groups, update hits_mv to have proper hit classification Rebuild crawl_aggregates for a given crawl Refresh memcache """ service = get_prediction_service() pid = Pid('classify_spam', True) transaction.enter_transaction_management() transaction.managed(True) start_time = time.time() try: number_of_predictions = 0 for c in list( Crawl.objects.filter(is_spam_computed=False).order_by( '-id')[:options['limit']]): log.info("processing %s", c) spam = set([]) not_spam = set([]) updated = 0 for row in query_to_dicts( """select content_id, group_id, is_spam from hits_mv where crawl_id = %s""", c.id): log.info("classyfing crawl_id: %s, %s", c.id, row) if row['is_spam'] is None: is_spam = None content = HitGroupContent.objects.get( id=row['content_id']) if content.is_spam is None: data = content.prepare_for_prediction() body = {'input': {'csvInstance': data}} prediction = service.predict( body=body, data=options['file']).execute() number_of_predictions += 1 updated += 1 content.is_spam = prediction['outputLabel'] != 'No' content.save() execute_sql( "update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id'])) transaction.commit() if content.is_spam: log.info("detected spam for %s", row) spam.add(str(row['content_id'])) else: not_spam.add(str(row['content_id'])) else: log.info("is_spam already computed for %s" % row) if updated > 0: c.is_spam_computed = True c.save() log.info("done classyfing crawl") execute_sql("""UPDATE main_crawlagregates set spam_projects = ( select count(*) from hits_mv where crawl_id = %s and is_spam = true ) where crawl_id = %s""" % (c.id, c.id)) transaction.commit() log.info("dome processing %s", c) except (KeyError, KeyboardInterrupt, HttpError), e: log.error(e) transaction.rollback() pid.remove_pid() exit()