def arrivals(request):
    params = {
        'multichart': False,
        'columns': ARRIVALS_COLUMNS,
        'title': 'New Tasks/HITs/$$$ per day'
    }

    def arrivals_data_formatter(input):
        for cc in input:
            yield {
                'date': cc['start_time'],
                'row': (str(cc['hits']), str(cc['reward'])),
            }

    date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()

    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:
        date_from = datetime.datetime(*time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(*time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']

    # Parameterized to avoid interpolating request data into the SQL string;
    # query_to_dicts accepts positional parameters (see the general() view).
    data = arrivals_data_formatter(query_to_dicts('''
        select date as "start_time", arrivals as "hits",
            arrivals_value as "reward"
        from main_daystats
        where date >= %s and date <= %s
        ''', date_from, date_to))

    params['data'] = data
    return direct_to_template(request, 'main/graphs/timeline.html', params)
def completed(request):
    params = {
        'columns': DEFAULT_COLUMNS,
        'title': 'Tasks/HITs/$$$ completed per day'
    }

    date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()

    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:
        date_from = datetime.datetime(*time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(*time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']

    # Parameterized to avoid interpolating request data into the SQL string.
    data = data_formater(query_to_dicts('''
        select date as "start_time",
            day_start_hits - day_end_hits as "hits",
            day_start_reward - day_end_reward as "reward",
            day_start_projects - day_end_projects as "count"
        from main_daystats
        where day_end_hits != 0 and date >= %s and date <= %s
        ''', date_from, date_to))

    params['data'] = data
    return direct_to_template(request, 'main/graphs/timeline.html', params)
def handle(self, **options):
    commit_threshold = 10
    i = 0  # keep defined so the error message works if the query itself fails
    try:
        results = query_to_dicts(
            "select id from main_crawl p "
            "where not exists(select id from hits_mv where crawl_id = p.id)")
        for i, row in enumerate(results):
            execute_sql("""insert into hits_mv
                SELECT p.id AS status_id, q.id AS content_id, p.group_id,
                    p.crawl_id,
                    (SELECT main_crawl.start_time
                     FROM main_crawl
                     WHERE main_crawl.id = p.crawl_id) AS start_time,
                    q.requester_id, p.hits_available, p.page_number,
                    p.inpage_position, p.hit_expiration_date, q.reward,
                    q.time_alloted
                FROM main_hitgroupstatus p
                JOIN main_hitgroupcontent q
                    ON q.group_id::text = p.group_id::text
                    AND p.hit_group_content_id = q.id
                WHERE p.crawl_id = %s""" % row['id'])
            if i % commit_threshold == 0:
                print datetime.datetime.now(), 'committed after %s crawls' % i
                execute_sql('commit;')
    except:
        error_info = grab_error(sys.exc_info())
        print 'an error occurred at: %s line, %s %s' % (
            i, error_info['type'], error_info['value'])
        execute_sql('rollback;')
def report(request, classes):
    ctx = {}
    date_from, date_to = get_time_interval(request.REQUEST, ctx, days_ago=1)
    page = int(request.REQUEST.get("page", 1))
    size = int(request.REQUEST.get("size", 5))
    # page, size and classes are cast to int before being interpolated, so no
    # raw request strings end up inside the query.
    query = """
        SELECT
            hmv.crawl_id, hmv.start_time, hgcls.classes,
            hgcnt.group_id, hgcnt.title, hgcnt.description
        FROM hits_mv AS hmv
        JOIN main_hitgroupclass AS hgcls ON hmv.group_id = hgcls.group_id
        JOIN main_hitgroupcontent AS hgcnt ON hgcls.group_id = hgcnt.group_id
        WHERE hmv.start_time >= '{}' AND
              hmv.start_time < '{}' AND
              hgcls.classes & {} <> 0
        ORDER BY start_time ASC
        LIMIT {} OFFSET {}
        """.format(date_from, date_to, int(classes), size, (page - 1) * size)
    data = query_to_dicts(query)

    # Unused; note that query_to_dicts yields dicts, so this would have to
    # index by column name rather than position before being re-enabled.
    # def _data_formatter(input):
    #     for cc in input:
    #         yield cc[0], cc[1], cc[2]
    # data = _data_formatter(data)

    ctx["data"] = data
    ctx["classes"] = classes
    ctx["first_page"] = page == 1
    ctx["last_page"] = False  # TODO finish it.
    ctx["next_page"] = page + 1
    ctx["prev_page"] = page - 1
    if size != 5:
        ctx["size"] = size
    return direct_to_template(request, 'main/classification_report.html', ctx)
def handle(self, *args, **options):

    def _to_hit_group_class(results):
        for result in results:
            doc = result['document']
            prob = result['probabilities']
            yield HitGroupClass(
                group_id=doc['group_id'],
                classes=NaiveBayesClassifier.most_likely(result),
                probabilities=json.dumps(prob))

    if options['clear_all']:
        logger.info('Removing all existing classification')
        # HitGroupClass.objects.all().delete()
        execute_sql('DELETE FROM main_hitgroupclass;', commit=True)
        return

    if options['begin'] and options['end']:
        # XXX it can be slow. Timestamps must be quoted and group_id
        # qualified, otherwise the query is ambiguous between the two tables.
        query = '''
            SELECT content.group_id, title, description, keywords
            FROM main_hitgroupcontent as content
            JOIN hits_mv ON content.group_id = hits_mv.group_id
            WHERE NOT EXISTS(
                    SELECT * FROM main_hitgroupclass as class
                    WHERE content.group_id = class.group_id
                )
                AND hits_mv.start_time >= '{}'
                AND hits_mv.start_time < '{}'
            GROUP BY content.group_id, title, description, keywords
            LIMIT {};
            '''.format(options['begin'], options['end'], self.BATCH_SIZE)
    else:
        query = '''
            SELECT group_id, title, description, keywords
            FROM main_hitgroupcontent as content
            WHERE NOT EXISTS(
                    SELECT * FROM main_hitgroupclass as class
                    WHERE content.group_id = class.group_id
                )
            LIMIT {};
            '''.format(self.BATCH_SIZE)

    if not options['classifier_path']:
        try:
            options['classifier_path'] = settings.CLASSIFIER_PATH
        except AttributeError:
            raise ImproperlyConfigured('Classifier path is specified '
                                       'neither in the settings file '
                                       'nor on the command line')

    with open(options['classifier_path'], 'r') as file:
        probabilities = json.load(file)

    classifier = NaiveBayesClassifier(probabilities=probabilities)
    logger.info('Classification of hit groups started. Processing in '
                'batches of size {}'.format(self.BATCH_SIZE))

    # Each pass classifies up to BATCH_SIZE unclassified groups; the NOT
    # EXISTS clause excludes already classified ones, so the loop terminates
    # once a batch comes back empty.
    while True:
        models = query_to_dicts(query)
        logger.info('Batch classification started')
        try:
            results = _to_hit_group_class(classifier.classify_batch(models))
            HitGroupClass.objects.bulk_create(results)
        except EmptyBatchException:
            logger.info('Batch is empty; no hit groups to classify')
            break
        logger.info('Batch classified successfully')
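# A minimal sketch of the result shape that _to_hit_group_class() above
# assumes classifier.classify_batch() yields. This is inferred from the keys
# the helper reads; the exact contract lives in the classification module,
# which is not shown here:
#
#   {
#       'document': {'group_id': ..., 'title': ..., ...},   # the input row
#       'probabilities': {<class label>: <float>, ...},     # per-class score
#   }
#
# NaiveBayesClassifier.most_likely() then reduces that dict to the single
# best class, which is what gets stored in HitGroupClass.classes.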
def classification(request):
    data = query_to_dicts(
        """
        SELECT classes, COUNT(classes) number
        FROM main_hitgroupclass
        GROUP BY classes;
        """)
    data = list(data)
    total = 0  # renamed from `sum`, which shadowed the builtin
    for d in data:
        total += d["number"]
    for d in data:
        d["name"] = NaiveBayesClassifier.label(d["classes"])
        d["part"] = 100 * float(d["number"]) / total
    params = {"data": data}
    return direct_to_template(request, 'main/classification.html', params)
def get_crawls(start=None, end=None):
    """Returns dicts containing crawl ids."""
    st = time.time()
    log.debug("Fetching crawls to process.")

    extra_query = []
    if start and end:
        extra_query.append("p.start_time BETWEEN '{0}' AND '{1}'".format(
            start.isoformat(), end.isoformat()))
    extra_qstr = " AND " + " AND ".join(extra_query) if extra_query else ""

    query = """SELECT id FROM main_crawl p
        WHERE
            p.groups_available * {crawl_threshold} < p.groups_downloaded
            AND NOT EXISTS (
                SELECT id FROM main_crawlagregates WHERE crawl_id = p.id)
            {extra_limit}"""
    query = query.format(
        extra_limit=extra_qstr,
        crawl_threshold=settings.INCOMPLETE_CRAWL_THRESHOLD)

    results = query_to_dicts(query)
    log.debug("Crawls fetched in {0}s.".format(time.time() - st))
    return results
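# A usage sketch: get_crawls() returns the lazy generator produced by
# query_to_dicts, yielding {'id': ...} dicts, so callers typically stream it
# rather than materialize it. The date window and the per-crawl handler below
# are illustrative only (process_crawl is hypothetical):
#
#   import datetime
#
#   yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
#   for row in get_crawls(start=yesterday, end=datetime.datetime.now()):
#       process_crawl(row['id'])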
def general(request):
    params = {
        'multichart': True,
        'columns': GENERAL_COLUMNS,
        'title': 'General Data'
    }

    if 'date_from' in request.GET:
        date_from = datetime.datetime(
            *time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
    else:
        date_from = datetime.datetime.now() - datetime.timedelta(days=7)

    if 'date_to' in request.GET:
        date_to = datetime.datetime(
            *time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
    else:
        date_to = datetime.datetime.now()

    params['date_from'] = date_from.strftime('%m/%d/%Y')
    params['date_to'] = date_to.strftime('%m/%d/%Y')

    data = data_formater(query_to_dicts('''
        select reward, hits, projects as "count", spam_projects, start_time
        from main_crawlagregates
        where start_time >= %s and start_time <= %s
        order by start_time asc
        ''', date_from, date_to))

    def _is_anomaly(a, others):
        """A point is anomalous when its hits value strays more than 7000
        from the mean of its neighbours."""
        mid = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        return abs(mid - int(a['row'][0])) > 7000

    def _fixer(a, others):
        """Replaces an anomalous hits value with the neighbourhood mean."""
        val = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        a['row'] = (str(val), a['row'][1], a['row'][2], a['row'][3])
        return a

    if settings.DATASMOOTHING:
        params['data'] = plot.repair(list(data), _is_anomaly, _fixer, 2)
    else:
        params['data'] = list(data)
    return direct_to_template(request, 'main/graphs/timeline.html', params)
def calculate_first_crawl_id():
    progress = 10
    results = query_to_dicts(
        "select id from main_hitgroupcontent where first_crawl_id is null")
    log.info('got missing ids results')
    for i, r in enumerate(results):
        log.info("\tprocessing %s" % r['id'])
        execute_sql("""update main_hitgroupcontent p
            set first_crawl_id = (
                select min(crawl_id)
                from main_hitgroupstatus
                where hit_group_content_id = p.id)
            where id = %s""" % r['id'])
        if i % progress == 0:
            execute_sql('commit;')
            log.info("updated %s main_hitgroupcontent rows with first_crawl_id" % i)
    execute_sql('commit;')
def topreq_data_hits_available(days):
    """Evaluates toprequesters by the number of hits available as an average
    over the time period.

    Keyword arguments:
    days -- number of days to look back

    """
    start_time = datetime.date.today() - datetime.timedelta(int(days))
    # Groups with no hits_mv rows inside the window still appear thanks to
    # the LEFT JOIN, but their hits/reward coalesce to 0; grp_last_posted is
    # the latest start_time observed for the group inside the window.
    return list(query_to_dicts("""
        SELECT
            h.requester_id,
            h.requester_name,
            count(DISTINCT hitgroup.group_id) as "projects",
            coalesce(round(CAST(sum(hitgroup.grp_hits) as NUMERIC), 0), 0) as hits,
            coalesce(sum(hitgroup.grp_hits * h.reward), 0) as reward,
            max(hitgroup.grp_last_posted) as "last_posted"
        FROM main_hitgroupcontent h
            LEFT JOIN main_requesterprofile p
                ON h.requester_id = p.requester_id
            LEFT JOIN (
                SELECT
                    mv.group_id,
                    coalesce(avg(mv.hits_available), 0) as "grp_hits",
                    max(mv.start_time) as "grp_last_posted"
                FROM (
                    SELECT group_id, hits_available, start_time
                    FROM hits_mv
                    WHERE start_time > '{0}'
                ) mv
                GROUP BY mv.group_id
            ) hitgroup
                ON h.group_id = hitgroup.group_id
        WHERE coalesce(p.is_public, true) = true
        GROUP BY h.requester_id, h.requester_name
        ORDER BY reward DESC
        LIMIT 1000;""".format(start_time.isoformat())))
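# Illustration (not part of the module's API): a pure-Python rendition of the
# aggregation above, useful for sanity-checking the SQL. The data and the
# helper name are hypothetical.
def _topreq_hits_available_example():
    observations = [
        # (requester_id, group_id, hits_available, reward) per crawl sample
        ('R1', 'g1', 10, 0.05),
        ('R1', 'g1', 6, 0.05),
        ('R1', 'g2', 4, 0.10),
    ]
    # Group samples per (requester, group), mirroring the inner subquery.
    per_group = {}
    for requester, group, hits, reward in observations:
        per_group.setdefault((requester, group), []).append((hits, reward))
    # Per group: average hits_available; per requester: sum the per-group
    # averages and avg * reward, mirroring the outer SELECT.
    totals = {}
    for (requester, group), rows in per_group.items():
        avg_hits = sum(h for h, _ in rows) / float(len(rows))
        reward = rows[0][1]
        acc = totals.setdefault(
            requester, {'projects': 0, 'hits': 0.0, 'reward': 0.0})
        acc['projects'] += 1
        acc['hits'] += avg_hits
        acc['reward'] += avg_hits * reward
    # Here: g1 averages 8 hits at $0.05 and g2 averages 4 hits at $0.10, so
    # R1 gets projects=2, hits=12.0, reward=0.8.
    return totals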
def handle(self, **options):
    commit_threshold = 1000
    i = 0  # keep defined so the error message works if the query itself fails
    try:
        results = query_to_dicts(
            "select id from main_hitgroupcontent where first_crawl_id is null")
        for i, row in enumerate(results):
            execute_sql("""update main_hitgroupcontent p
                set first_crawl_id = (
                    select crawl_id
                    from main_hitgroupstatus
                    where group_id = p.group_id
                    order by crawl_id asc
                    LIMIT 1)
                where id = %s""" % row['id'])
            if i % commit_threshold == 0:
                print datetime.datetime.now(), 'committed after %s rows' % i
                execute_sql('commit;')
    except:
        error_info = grab_error(sys.exc_info())
        print 'an error occurred at: %s line, %s %s' % (
            i, error_info['type'], error_info['value'])
        execute_sql('rollback;')
def clean_duplicates():
    ids = query_to_dicts(
        "select group_id from main_hitgroupcontent "
        "group by group_id having count(*) > 1;")
    for id in ids:
        log.info("Deleting duplicate group %s" % id['group_id'])
        # Remove statuses pointing at the duplicate content rows first, then
        # the duplicate content rows themselves, keeping the row with the
        # lowest id for each group_id.
        execute_sql("""delete from main_hitgroupstatus
            where hit_group_content_id in (
                select id from main_hitgroupcontent
                where id != (select min(id) from main_hitgroupcontent
                             where group_id = '%s')
                and group_id = '%s');""" % (id['group_id'], id['group_id']))
        execute_sql("""delete from main_hitgroupcontent
            where id != (select min(id) from main_hitgroupcontent
                         where group_id = '%s')
            and group_id = '%s'""" % (id['group_id'], id['group_id']))
    execute_sql('commit;')
def handle(self, *args, **options):

    def _to_hit_group_class(results):
        for result in results:
            doc = result["document"]
            prob = result["probabilities"]
            yield HitGroupClass(
                group_id=doc["group_id"],
                classes=NaiveBayesClassifier.most_likely(result),
                probabilities=json.dumps(prob))

    if options["remove"]:
        logger.info("Removing existing classification")
        HitGroupClass.objects.all().delete()
        logger.info("Classification removed")
        return

    with open(options["input_path"], "r") as file:
        probabilities = json.load(file)
    classifier = NaiveBayesClassifier(probabilities=probabilities)

    # TODO remove this -- for debug purposes only
    if options["single"] and options["group_id"]:
        model = HitGroupContent.objects.get(group_id=options['group_id'])
        print classifier.classify(model)
        return

    logger.info("Classification of hit groups started. Processing in "
                "batches of size {}".format(self.BATCH_SIZE))
    while True:
        models = query_to_dicts(
            """
            SELECT group_id, title, description, keywords
            FROM main_hitgroupcontent as content
            WHERE NOT EXISTS(
                SELECT * FROM main_hitgroupclass as class
                WHERE content.group_id = class.group_id
            )
            LIMIT {};
            """.format(self.BATCH_SIZE))
        logger.info("Batch classification started")
        try:
            results = _to_hit_group_class(classifier.classify_batch(models))
            HitGroupClass.objects.bulk_create(results)
        except EmptyBatchException:
            logger.info("Batch is empty; no hit groups to classify")
            break
        logger.info("Batch classified successfully")
def topreq_data_hits_posted(days):
    """Evaluates toprequesters by the number of hits posted in a period.

    Keyword arguments:
    days -- number of days to look back

    """
    start_time = datetime.date.today() - datetime.timedelta(int(days))
    # We are only interested in records having hits_posted > 0, thus only
    # such records will appear on the list and max(start_time) should be
    # available at all times.
    return list(query_to_dicts("""
        SELECT
            h.requester_id,
            h.requester_name,
            coalesce(count(distinct mv.group_id), 0) as "projects",
            coalesce(sum(mv.hits_posted), 0) as "hits",
            coalesce(sum(mv.hits_posted * h.reward), 0) as "reward",
            max(mv.start_time) as "last_posted"
        FROM main_hitgroupcontent h
            LEFT JOIN main_requesterprofile p
                ON h.requester_id = p.requester_id
            LEFT JOIN (
                SELECT group_id, hits_posted, start_time
                FROM hits_mv
                WHERE start_time > '{0}' AND hits_posted > 0
            ) mv
                ON h.group_id = mv.group_id
        WHERE coalesce(p.is_public, true) = true
        GROUP BY h.requester_id, h.requester_name
        ORDER BY reward DESC
        LIMIT 1000;""".format(start_time.isoformat())))
def general(request, tab_slug=None):
    tab = GeneralTabEnum.value_for_slug.get(tab_slug, GeneralTabEnum.ALL)
    ctx = {
        'multichart': tab == GeneralTabEnum.ALL,
        'columns': GeneralTabEnum.get_graph_columns(tab),
        'title': GeneralTabEnum.get_tab_title(tab),
        'current_tab': GeneralTabEnum.enum_dict[tab],
        'top_tabs': GeneralTabEnum.enum_dict.values(),
    }
    date_from, date_to = get_time_interval(request.GET, ctx, days_ago=7)
    data = data_formater(query_to_dicts('''
        SELECT reward, hits, projects as "count", spam_projects, start_time
        FROM main_crawlagregates
        WHERE start_time >= %s AND start_time <= %s
        ORDER BY start_time ASC
        ''', date_from, date_to))

    def _is_anomaly(a, others):
        """A point is anomalous when its hits value strays more than 7000
        from the mean of its neighbours."""
        mid = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        return abs(mid - int(a['row'][0])) > 7000

    def _fixer(a, others):
        """Replaces an anomalous hits value with the neighbourhood mean."""
        val = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        a['row'] = (str(val), a['row'][1], a['row'][2], a['row'][3])
        return a

    if settings.DATASMOOTHING:
        ctx['data'] = plot.repair(list(data), _is_anomaly, _fixer, 2)
    else:
        ctx['data'] = list(data)
    ctx['data'] = GeneralTabEnum.data_set_processor[tab](ctx['data'])
    return direct_to_template(request, 'main/graphs/timeline.html', ctx)
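# A minimal sketch of the smoothing contract assumed by the two views above.
# The real implementation lives in the plot module (not shown here); this
# only illustrates the presumed semantics of repair(data, is_anomaly, fixer,
# window): compare each point against up to `window` neighbours on each side
# and rewrite it with `fixer` when `is_anomaly` flags it.
def _repair_sketch(data, is_anomaly, fixer, window):
    repaired = []
    for i, point in enumerate(data):
        # Neighbours on both sides, excluding the point itself.
        others = data[max(0, i - window):i] + data[i + 1:i + 1 + window]
        if others and is_anomaly(point, others):
            point = fixer(point, others)
        repaired.append(point)
    return repaired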
def hit_group_details(request, hit_group_id):
    try:
        hit_group = HitGroupContent.objects.get(group_id=hit_group_id)
        if RequesterProfile.objects.filter(requester_id=hit_group.requester_id,
                                           is_public=False):
            raise HitGroupContent.DoesNotExist()
    except HitGroupContent.DoesNotExist:
        messages.info(request, 'Hitgroup with id "{0}" was not found!'.format(
            hit_group_id))
        return redirect('haystack_search')

    try:
        hit_group_class = HitGroupClass.objects.get(group_id=hit_group_id)
    except ObjectDoesNotExist:
        # TODO classification should be done on all models.
        hit_group_class = None
        try:
            with open(settings.CLASSIFIER_PATH, "r") as file:
                classifier = NaiveBayesClassifier(probabilities=json.load(file))
            classified = classifier.classify(hit_group)
            most_likely = classifier.most_likely(classified)
            document = classified["document"]
            hit_group_class = HitGroupClass(
                group_id=document.group_id,
                classes=most_likely,
                probabilities=classified["probabilities"])
            hit_group_class.save()
        except IOError:
            # We do not want to make the hit group details page unavailable
            # when the classifier file does not exist.
            pass

    if hit_group_class is not None:
        hit_group_class_label = NaiveBayesClassifier.label(hit_group_class.classes)
    else:
        hit_group_class_label = NaiveBayesClassifier.label()

    params = {
        'multichart': False,
        'columns': HIT_DETAILS_COLUMNS,
        'title': '#Hits',
        'class': hit_group_class_label,
    }

    def hit_group_details_data_formater(input):
        for cc in input:
            yield {
                'date': cc['start_time'],
                'row': (str(cc['hits_available']),),
            }

    dicts = query_to_dicts(
        """
        select start_time, hits_available from hits_mv
        where group_id = '{}' order by start_time asc
        """.format(hit_group_id))
    data = hit_group_details_data_formater(dicts)

    params['date_from'] = hit_group.occurrence_date
    params['date_to'] = datetime.datetime.utcnow()
    params['data'] = data
    params['hit_group'] = hit_group
    return direct_to_template(request, 'main/hit_group_details.html', params)
def handle(self, **options): """ Take ${lmit} last crawls without spam classification Classify all hit groups, update hits_mv to have proper hit classification Rebuild crawl_aggregates for a given crawl Refresh memcache """ service = get_prediction_service() pid = Pid('classify_spam', True) transaction.enter_transaction_management() transaction.managed(True) start_time = time.time() try: number_of_predictions = 0 for c in list(Crawl.objects.filter(is_spam_computed=False).order_by('-id')[:options['limit']]): log.info("processing %s", c) spam = set([]) not_spam = set([]) updated = 0 for row in query_to_dicts("""select content_id, group_id, is_spam from hits_mv where crawl_id = %s""", c.id): log.info("classyfing crawl_id: %s, %s", c.id, row) if row['is_spam'] is None: is_spam = None content = HitGroupContent.objects.get(id=row['content_id']) if content.is_spam is None: data = content.prepare_for_prediction() body = {'input': {'csvInstance': data}} prediction = service.predict(body=body, data=options['file']).execute() number_of_predictions += 1 updated += 1 content.is_spam = prediction['outputLabel'] != 'No' content.save() execute_sql("update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id'])) transaction.commit() if content.is_spam: log.info("detected spam for %s", row) spam.add(str(row['content_id'])) else: not_spam.add(str(row['content_id'])) else: log.info("is_spam already computed for %s" % row) if updated > 0: c.is_spam_computed = True c.save() log.info("done classyfing crawl") execute_sql("""UPDATE main_crawlagregates set spam_projects = ( select count(*) from hits_mv where crawl_id = %s and is_spam = true ) where crawl_id = %s""" % (c.id, c.id)) transaction.commit() log.info("dome processing %s", c) except (KeyError, KeyboardInterrupt, HttpError), e: log.error(e) transaction.rollback() pid.remove_pid() exit()
def handle(self, **options): """ take all crawls that have diffs computed from - inf till yesterday start from the oldest crawl and try to create daily stats if there are some crawls without diffs stop the process """ ''' take earliest crawl calculate incosistencies in history calculate data for every missing element ''' crawl = get_first_crawl() if not crawl: log.error("No crawls to process.") return transaction.enter_transaction_management() transaction.managed(True) created = 0 start, end = (crawl.start_day(), datetime.date.today()) days = (end - start).days log.info("Processing {0} to {1} ({2} days).".format(start, end, days)) for i in range(0, days): day = crawl.start_day() + datetime.timedelta(days=i) day_end = day + datetime.timedelta(days=1) crawls = Crawl.objects.filter(has_diffs=False, start_time__gte=day, start_time__lt=day_end) if len(crawls) > 0: log.error("Not all crawls from %s have diffs." % day) continue try: DayStats.objects.get(date=day) except DayStats.DoesNotExist: log.info("Calculating stats for %s" % day) range_start_date = day.isoformat() range_end_date = (day_end).isoformat() log.info("Calculating arrivals.") ''' stats for projects posted on particular day ''' arrivals = query_to_dicts(''' select sum(hits_diff) as "arrivals", sum(hits_diff*reward) as "arrivals_value" from hits_mv p where start_time between TIMESTAMP '%s' and TIMESTAMP '%s' and hits_diff > 0 ''' % (range_start_date, range_end_date)).next() if arrivals['arrivals'] == None: log.info('Arrivals is None, skipping.') continue log.info("Calculating processed.") processed = query_to_dicts(''' select sum(hits_diff) as "processed", sum(hits_diff*reward) as "processed_value" from hits_mv p where start_time between TIMESTAMP '%s' and TIMESTAMP '%s' and hits_diff < 0 ''' % (range_start_date, range_end_date)).next() if processed['processed'] == None: log.info('Processed is None, skipping.') continue DayStats.objects.create(date=day, arrivals=arrivals['arrivals'], arrivals_value=arrivals['arrivals_value'], processed=processed['processed'], processed_value=processed['processed_value']) created += 1 transaction.commit() log.info('Finish: {0} DayStat objects were created.'.format(created))
def update_crawl_agregates(commit_threshold=1000, only_new=True):
    """Creates main_crawlagregates records for hits_mv."""

    def print_status(number, row_id):
        log.info('Committed after %s crawls, last id %s.' % (number, row_id))

    query = """SELECT id FROM main_crawl p
        WHERE {0} groups_available * 0.9 < groups_downloaded
            AND NOT exists(
                SELECT id FROM main_crawlagregates WHERE crawl_id = p.id)
        """
    query = query.format('old_id is NULL AND' if only_new else '')
    results = query_to_dicts(query)
    log.info("Fetched crawls to process.")

    i = 0
    for i, row in enumerate(results):
        try:
            execute_sql("""
                INSERT INTO main_crawlagregates
                    (hits, start_time, reward, crawl_id, id, projects, spam_projects)
                SELECT sum(hits_available) as "hits", start_time,
                    sum(reward * hits_available) as "reward", crawl_id,
                    nextval('main_crawlagregates_id_seq'),
                    count(*) as "count", 0
                FROM (SELECT DISTINCT ON (group_id) *
                      FROM hits_mv WHERE crawl_id = %s) AS p
                GROUP BY crawl_id, start_time
                """, row['id'])

            execute_sql("""UPDATE main_crawlagregates
                set spam_projects = (
                    select count(*) from hits_mv
                    where crawl_id = %s and is_spam = true)
                where crawl_id = %s""" % (row['id'], row['id']))

            if i % commit_threshold == 0:
                print_status(i, row['id'])
                execute_sql('commit;')
        except:
            error_info = grab_error(sys.exc_info())
            log.error('An error occurred at crawl_id: %s, %s %s' % (
                row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')

    if i % commit_threshold != 0:
        print_status(i, row['id'])
        execute_sql('commit;')

    # Delete dummy aggregates produced from incomplete crawls.
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")