Example #1
def arrivals(request):

    params = {
        'multichart': False,
        'columns': ARRIVALS_COLUMNS,
        'title': 'New Tasks/HITs/$$$ per day'
    }

    def arrivals_data_formater(input):
        for cc in input:
            yield {
                    'date': cc['start_time'],
                    'row': (str(cc['hits']), str(cc['reward'])),
            }

    date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()

    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:

        date_from = datetime.datetime(*time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(*time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']

    data = arrivals_data_formater(query_to_dicts('''
        select date as "start_time", arrivals as "hits", arrivals_value as "reward"
        from main_daystats where date >= '%s' and date <= '%s'
    ''' % (date_from, date_to)))

    params['data'] = data

    return direct_to_template(request, 'main/graphs/timeline.html', params)
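All of the examples on this page lean on two small helpers, query_to_dicts and execute_sql, whose definitions are not shown here. A minimal sketch of what they presumably look like, assuming a Django database connection and that query_to_dicts yields rows as dicts and, as in Examples #8 and #15, forwards extra positional arguments to cursor.execute:

from django.db import connection


def execute_sql(sql, *args):
    # Run a raw SQL statement; extra arguments are assumed to be passed
    # through to cursor.execute as query parameters.
    cursor = connection.cursor()
    cursor.execute(sql, args or None)
    return cursor


def query_to_dicts(sql, *args):
    # Yield each result row as a dict keyed by column name (an assumption,
    # based on how the examples index the returned rows).
    cursor = execute_sql(sql, *args)
    columns = [col[0] for col in cursor.description]
    for row in cursor:
        yield dict(zip(columns, row))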
Example #2
def completed(request):

    params = {
        'columns': DEFAULT_COLUMNS,
        'title': 'Tasks/HITs/$$$ completed per day'
    }

    date_from = (datetime.date.today() - datetime.timedelta(days=30)).isoformat()
    date_to = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()

    if request.method == 'GET' and 'date_from' in request.GET and 'date_to' in request.GET:

        date_from = datetime.datetime(*time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
        date_to = datetime.datetime(*time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
        params['date_from'] = request.GET['date_from']
        params['date_to'] = request.GET['date_to']

    data = data_formater(query_to_dicts('''
        select date as "start_time", day_start_hits - day_end_hits as "hits", day_start_reward - day_end_reward as "reward", day_start_projects - day_end_projects as "count"
            from main_daystats where day_end_hits != 0 and date >= '%s' and date <= '%s'
    ''' % (date_from, date_to)))

    params['data'] = data

    return direct_to_template(request, 'main/graphs/timeline.html', params)
Example #3
    def handle(self, **options):

        try:

            commit_threshold = 10
            results = query_to_dicts("select id from main_crawl p where not exists(select id from hits_mv where crawl_id = p.id)")

            for i, row in enumerate(results):

                execute_sql("""insert into hits_mv
    SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id, ( SELECT main_crawl.start_time
               FROM main_crawl
              WHERE main_crawl.id = p.crawl_id) AS start_time, q.requester_id, p.hits_available, p.page_number, p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted
       FROM main_hitgroupstatus p
       JOIN main_hitgroupcontent q ON q.group_id::text = p.group_id::text AND p.hit_group_content_id = q.id
      WHERE p.crawl_id = %s
                """ % row['id'])

                if i % commit_threshold == 0:
                    print datetime.datetime.now(), 'committed after %s crawls' % i
                    execute_sql('commit;')

        except:
            error_info = grab_error(sys.exc_info())
            print 'an error occurred at: %s line, %s %s' % (i, error_info['type'], error_info['value'])
            execute_sql('rollback;')
Example #4
def report(request, classes):
    ctx = {}
    date_from, date_to = get_time_interval(request.REQUEST, ctx, days_ago=1)
    page = int(request.REQUEST.get("page", 1))
    size = int(request.REQUEST.get("size", 5))
    query = \
    """ SELECT hmv.crawl_id, hmv.start_time, hgcls.classes, 
               hgcnt.group_id, hgcnt.title, hgcnt.description
        FROM hits_mv AS hmv
        JOIN main_hitgroupclass AS hgcls ON hmv.group_id = hgcls.group_id
        JOIN main_hitgroupcontent AS hgcnt ON hgcls.group_id = hgcnt.group_id
        WHERE hmv.start_time >= '{}' AND hmv.start_time < '{}' AND 
              hgcls.classes & {} <> 0
        ORDER BY start_time ASC
        LIMIT {}
        OFFSET {}
    """.format(date_from, date_to, classes, size, (page - 1) * size)
    data = query_to_dicts(query)
    def _data_formatter(input):
        for cc in input:
            yield cc[0], cc[1], cc[2]
    # data = _data_formatter(data)
    ctx["data"] = data
    ctx["classes"] = classes
    ctx["first_page"] = page == 1
    ctx["last_page"] = False # TODO finish it.
    ctx["next_page"] = page + 1
    ctx["prev_page"] = page - 1
    if size != 5:
        ctx["size"] = size
    return direct_to_template(request, 'main/classification_report.html', ctx)
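The report query above interpolates date_from, date_to, classes, size and the offset straight into the SQL string with str.format. Since query_to_dicts accepts extra positional arguments (see Examples #8 and #15), the query construction inside report() could instead be parameterized; a hedged sketch, assuming those arguments reach cursor.execute and that classes is an integer bitmask:

    # Hypothetical parameterized form of the same report query.
    query = """ SELECT hmv.crawl_id, hmv.start_time, hgcls.classes,
                       hgcnt.group_id, hgcnt.title, hgcnt.description
                FROM hits_mv AS hmv
                JOIN main_hitgroupclass AS hgcls ON hmv.group_id = hgcls.group_id
                JOIN main_hitgroupcontent AS hgcnt ON hgcls.group_id = hgcnt.group_id
                WHERE hmv.start_time >= %s AND hmv.start_time < %s AND
                      hgcls.classes & %s <> 0
                ORDER BY start_time ASC
                LIMIT %s OFFSET %s """
    data = query_to_dicts(query, date_from, date_to, classes, size, (page - 1) * size)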
Example #5
 def handle(self, *args, **options):
     def _to_hit_group_class(results):
         for result in results:
             doc = result['document']
             prob = result['probabilities']
             yield HitGroupClass(group_id=doc['group_id'],
                                 classes=NaiveBayesClassifier.most_likely(result),
                                 probabilities=json.dumps(prob))
     if options['clear_all']:
         logger.info('Removing all existing classification')
         # HitGroupClass.objects.all().delete()
         execute_sql('DELETE FROM main_hitgroupclass;', commit=True)
         return
     if options['begin'] and options['end']:
         # XXX it can be slow.
         query = ''' SELECT group_id, title, description, keywords
                     FROM main_hitgroupcontent as content
                     JOIN hits_mv 
                     ON content.group_id = hits_mv.group_id
                     WHERE 
                         NOT EXISTS(
                             SELECT * FROM main_hitgroupclass as class
                             WHERE content.group_id = class.group_id
                         ) AND 
                         hits_mv.start_time >= {} AND 
                         hits_mv.start_time < {} 
                     GROUP BY group_id
                     LIMIT {};
                 '''.format(options['begin'], options['end'], self.BATCH_SIZE)
     else:
         query = ''' SELECT group_id, title, description, keywords
                     FROM main_hitgroupcontent as content
                     WHERE NOT EXISTS(
                         SELECT * FROM main_hitgroupclass as class
                         WHERE content.group_id = class.group_id
                     ) LIMIT {};
                 '''.format(self.BATCH_SIZE)
     if not options['classifier_path']:
         try:
             options['classifier_path'] = settings.CLASSIFIER_PATH
         except AttributeError:
             raise ImproperlyConfigured('Classifier path is specified '
                                        'neither in the settings file '
                                        'nor on the command line')
     with open(options['classifier_path'], 'r') as file:
         probabilities = json.load(file)
         classifier = NaiveBayesClassifier(probabilities=probabilities)
         logger.info('Classification of hit groups started. Processing in '\
                     'batches of size {}'.format(self.BATCH_SIZE))
         while True:
             models = query_to_dicts(query)
             logger.info('Batch classification started')
             try:
                 results = _to_hit_group_class(classifier.classify_batch(models))
                 HitGroupClass.objects.bulk_create(results)
             except EmptyBatchException:
                 logger.info('Batch is empty no hit groups to classify')
                 break
             logger.info('Batch classified successfully')
Example #6
def classification(request):
    data = query_to_dicts(
        """ SELECT classes, COUNT(classes) number
            FROM main_hitgroupclass
            GROUP BY classes;
        """)
    data = list(data)
    total = sum(d["number"] for d in data)

    for d in data:
        d["name"] = NaiveBayesClassifier.label(d["classes"])
        d["part"] = 100 * float(d["number"]) / total
    params = {"data":data}
    return direct_to_template(request, 'main/classification.html',
                              params)
Example #7
def get_crawls(start=None, end=None):
    """Returns dicts containing crawl ids."""
    st = time.time()
    log.debug("Fetching crawls to process.")
    extra_query = []
    if start and end:
        extra_query.append(
            "p.start_time BETWEEN '{0}' AND '{1}'".format(start.isoformat(), end.isoformat()))
    query = """SELECT id FROM main_crawl p WHERE
        p.groups_available * {crawl_threshold} < p.groups_downloaded AND
        NOT EXISTS (SELECT id FROM main_crawlagregates WHERE crawl_id = p.id)
        {extra_limit}"""
    extra_qstr = " AND " + " AND ".join(extra_query) if extra_query else ""
    query = query.format(extra_limit=extra_qstr, crawl_threshold=settings.INCOMPLETE_CRAWL_THRESHOLD)
    results = query_to_dicts(query)
    log.debug("Crawls fetched in {0}s.".format(time.time() - st))
    return results
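A possible call site for get_crawls, hypothetical but consistent with its signature and the isoformat() calls above:

import datetime

# Hypothetical usage: fetch ids of sufficiently complete, not yet
# aggregated crawls from the last 24 hours.
end = datetime.datetime.now()
start = end - datetime.timedelta(days=1)
for row in get_crawls(start=start, end=end):
    print row['id']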
Example #8
def general(request):

    params = {
        'multichart': True,
        'columns': GENERAL_COLUMNS,
        'title': 'General Data'
    }

    if 'date_from' in request.GET:
        date_from = datetime.datetime(
                *time.strptime(request.GET['date_from'], '%m/%d/%Y')[:6])
    else:
        date_from = datetime.datetime.now() - datetime.timedelta(days=7)

    if 'date_to' in request.GET:
        date_to = datetime.datetime(
                *time.strptime(request.GET['date_to'], '%m/%d/%Y')[:6])
    else:
        date_to = datetime.datetime.now()

    params['date_from'] = date_from.strftime('%m/%d/%Y')
    params['date_to'] = date_to.strftime('%m/%d/%Y')

    data = data_formater(query_to_dicts('''
        select reward, hits, projects as "count", spam_projects, start_time
            from main_crawlagregates
            where start_time >= %s and start_time <= %s
            order by start_time asc
        ''', date_from, date_to))

    def _is_anomaly(a, others):
        mid = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        return abs(mid - int(a['row'][0])) > 7000

    def _fixer(a, others):
        val = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        a['row'] = (str(val), a['row'][1], a['row'][2], a['row'][3])
        return a

    if settings.DATASMOOTHING:
        params['data'] = plot.repair(list(data), _is_anomaly, _fixer, 2)
    else:
        params['data'] = list(data)

    return direct_to_template(request, 'main/graphs/timeline.html', params)
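The smoothing branch above calls plot.repair, which is not shown on this page. Judging only from the call site, it takes the data list, an anomaly predicate, a fixer and a window size; a minimal sketch of such a routine, an assumption rather than the project's actual implementation:

def repair(data, is_anomaly, fixer, window):
    # Compare each point against up to `window` neighbours on each side;
    # if the predicate flags it as an anomaly, replace it with the fixer's output.
    repaired = []
    for i, point in enumerate(data):
        others = data[max(0, i - window):i] + data[i + 1:i + 1 + window]
        if others and is_anomaly(point, others):
            point = fixer(point, others)
        repaired.append(point)
    return repaired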
Example #9
def calculate_first_crawl_id():

    progress = 10
    results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null")
    log.info('got missing ids results')
    for i, r in enumerate(results):
        log.info("\tprocessing %s" % r['id'])
        execute_sql("""update main_hitgroupcontent p set first_crawl_id =
            (select min(crawl_id) from main_hitgroupstatus where hit_group_content_id = p.id)
            where
                id = %s
        """ % r['id'])

        if i % progress == 0:
            execute_sql('commit;')
            log.info("updated %s main_hitgroupcontent rows with first_crawl_id" % i)

    execute_sql('commit;')
Example #10
def topreq_data_hits_available(days):
    """Evaluates toprequesters by the number of hits available as an average
    over the time period.

    Keyword arguments:
    days -- number of days to look back

    """
    start_time = datetime.date.today() - datetime.timedelta(int(days))

    # The subquery averages hits_available per hit group over the period and
    # takes the latest start_time as the moment the group was last seen posted.
    return list(query_to_dicts("""
        SELECT
            h.requester_id,
            h.requester_name,
            count(DISTINCT hitgroup.group_id) as "projects",
            coalesce(round(CAST (sum(hitgroup.grp_hits) as NUMERIC), 0), 0) as hits,
            coalesce(sum(hitgroup.grp_hits * h.reward), 0) as reward,
            max(hitgroup.grp_last_posted) as "last_posted"
        FROM
            main_hitgroupcontent h
            LEFT JOIN main_requesterprofile p
                ON h.requester_id = p.requester_id
            LEFT JOIN (
            SELECT
                mv.group_id,
                coalesce(avg(mv.hits_available), 0) as "grp_hits",
                max(mv.start_time) as "grp_last_posted"
            FROM (
                SELECT group_id, hits_available, start_time
                FROM hits_mv
                WHERE
                    start_time > '{0}'
                ) mv
            GROUP BY mv.group_id
            ) hitgroup
                ON h.group_id = hitgroup.group_id
        WHERE
            coalesce(p.is_public, true) = true
        GROUP BY h.requester_id, h.requester_name
        ORDER BY reward DESC
        LIMIT 1000;""".format(start_time.isoformat())))
Example #11
    def handle(self, **options):
        try:
            commit_threshold = 1000
            results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null")

            for i, row in enumerate(results):
                execute_sql("""update main_hitgroupcontent p set first_crawl_id =
                    (select crawl_id from main_hitgroupstatus where group_id = p.group_id order by crawl_id asc LIMIT 1)
                    where
                        id = %s
                """ % row['id'])

                if i % commit_threshold == 0:
                    print datetime.datetime.now(), 'committed after %s rows' % i
                    execute_sql('commit;')
        except:
            error_info = grab_error(sys.exc_info())
            print 'an error occurred at: %s line, %s %s' % (i, error_info['type'], error_info['value'])
            execute_sql('rollback;')
Example #12
def clean_duplicates():

    ids = query_to_dicts("select group_id from main_hitgroupcontent group by group_id having count(*) > 1;")

    for id in ids:

        log.info("Deleting duplicate group %s" % id['group_id'])

        execute_sql("""delete from main_hitgroupstatus where
                        hit_group_content_id  in (
                            select id from main_hitgroupcontent where id !=
                                (select min(id) from main_hitgroupcontent where group_id = '%s')
                        and group_id = '%s');
        """ % (id['group_id'], id['group_id']))

        execute_sql("""delete from main_hitgroupcontent where
                        id != (select min(id) from main_hitgroupcontent where group_id = '%s') and group_id = '%s'
                    """ % (id['group_id'], id['group_id']))

    execute_sql('commit;')
Example #13
 def handle(self, *args, **options):
     def _to_hit_group_class(results):
         for result in results:
             doc = result["document"]
             prob = result["probabilities"]
             yield HitGroupClass(group_id=doc["group_id"],
                                 classes=NaiveBayesClassifier.most_likely(result),
                                 probabilities=json.dumps(prob))
     if options["remove"]:
         logger.info("Removing existing classification")
         HitGroupClass.objects.all().delete()
         logger.info("Classification removed")
         return
     with open(options["input_path"], "r") as file:
         probabilities = json.load(file)
         classifier = NaiveBayesClassifier(probabilities=probabilities)
         # TODO remove this -- for debug purposes only
         if options["single"] and options["group_id"]:
             model = HitGroupContent.objects.get(group_id=options['group_id'])
             print classifier.classify(model)
             return
         logger.info("Classification of hit groups started. Processing in "\
                     "batches of size {}".format(self.BATCH_SIZE))
         while True:
             models = query_to_dicts(
                 """ SELECT group_id, title, description, keywords
                     FROM main_hitgroupcontent as content
                     WHERE NOT EXISTS(
                         SELECT * FROM main_hitgroupclass as class
                         WHERE content.group_id = class.group_id
                     ) LIMIT {};
                 """.format(self.BATCH_SIZE))
             logger.info("Batch classification started")
             try:
                 results = _to_hit_group_class(classifier.classify_batch(models))
                 HitGroupClass.objects.bulk_create(results)
             except EmptyBatchException:
                 logger.info("Batch is empty no hit groups to classify")
                 break
             logger.info("Batch classified successfully")
Example #14
def topreq_data_hits_posted(days):
    """Evaluates toprequesters by the number of hits posted in a period.

    Keyword arguments:
    days -- number of days to look back

    """
    start_time = datetime.date.today() - datetime.timedelta(int(days))

    # We are only interested in records having hits_posted > 0, thus only such
    # records will appear on the list and max(start_time) should be available at
    # all times.
    return list(query_to_dicts("""
        SELECT
            h.requester_id,
            h.requester_name,
            coalesce(count(distinct mv.group_id), 0) as "projects",
            coalesce(sum(mv.hits_posted), 0) as "hits",
            coalesce(sum(mv.hits_posted * h.reward), 0) as "reward",
            max(mv.start_time) as "last_posted"
        FROM
            main_hitgroupcontent h
            LEFT JOIN main_requesterprofile p
                ON h.requester_id = p.requester_id
            LEFT JOIN (
                SELECT group_id, hits_posted, start_time
                FROM hits_mv
                WHERE
                    start_time > '{0}' AND
                    hits_posted > 0
            ) mv
                ON h.group_id = mv.group_id
        WHERE
            coalesce(p.is_public, true) = true
        GROUP BY h.requester_id, h.requester_name
        ORDER BY reward DESC
        LIMIT 1000;""".format(start_time.isoformat())))
Example #15
def general(request, tab_slug=None):

    tab = GeneralTabEnum.value_for_slug.get(tab_slug, GeneralTabEnum.ALL)

    ctx = {
        'multichart': tab == GeneralTabEnum.ALL,
        'columns': GeneralTabEnum.get_graph_columns(tab),
        'title': GeneralTabEnum.get_tab_title(tab),
        'current_tab': GeneralTabEnum.enum_dict[tab],
        'top_tabs': GeneralTabEnum.enum_dict.values(),
    }

    date_from, date_to = get_time_interval(request.GET, ctx, days_ago=7)

    data = data_formater(query_to_dicts('''
            SELECT reward, hits, projects as "count", spam_projects, start_time
            FROM main_crawlagregates
            WHERE start_time >= %s AND start_time <= %s
            ORDER BY start_time ASC
        ''', date_from, date_to))

    def _is_anomaly(a, others):
        mid = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        return abs(mid - int(a['row'][0])) > 7000

    def _fixer(a, others):
        val = sum(map(lambda e: int(e['row'][0]), others)) / len(others)
        a['row'] = (str(val), a['row'][1], a['row'][2], a['row'][3])
        return a

    if settings.DATASMOOTHING:
        ctx['data'] = plot.repair(list(data), _is_anomaly, _fixer, 2)
    else:
        ctx['data'] = list(data)

    ctx['data'] = GeneralTabEnum.data_set_processor[tab](ctx['data'])
    return direct_to_template(request, 'main/graphs/timeline.html', ctx)
Example #16
def hit_group_details(request, hit_group_id):

    try:
        hit_group = HitGroupContent.objects.get(group_id=hit_group_id)
        if RequesterProfile.objects.filter(requester_id=hit_group.requester_id,
            is_public=False):
            raise HitGroupContent.DoesNotExist()
    except HitGroupContent.DoesNotExist:
        messages.info(request, 'Hitgroup with id "{0}" was not found!'.format(
            hit_group_id))
        return redirect('haystack_search')

    try:
        hit_group_class = HitGroupClass.objects.get(group_id=hit_group_id)
    except ObjectDoesNotExist:
        # TODO classification should be done on all models.
        hit_group_class = None
        try:
            with open(settings.CLASSIFIER_PATH, "r") as file:
                classifier = NaiveBayesClassifier(probabilities=json.load(file))
                classified = classifier.classify(hit_group)
                most_likely = classifier.most_likely(classified)
                document = classified["document"]
                hit_group_class = HitGroupClass(
                        group_id=document.group_id,
                        classes=most_likely,
                        probabilities=classified["probabilities"])
                hit_group_class.save()
        except IOError:
            # We do not want make hit group details page unavailable when
            # classifier file does not exist.
            pass

    if hit_group_class is not None:
        hit_group_class_label = NaiveBayesClassifier.label(hit_group_class.classes)
    else:
        hit_group_class_label = NaiveBayesClassifier.label()

    params = {
        'multichart': False,
        'columns': HIT_DETAILS_COLUMNS,
        'title': '#Hits',
        'class': hit_group_class_label,
    }

    def hit_group_details_data_formater(input):
        for cc in input:
            yield {
                'date': cc['start_time'],
                'row': (str(cc['hits_available']),),
            }

    dicts = query_to_dicts(
                """ select start_time, hits_available from hits_mv
                    where group_id = '{}' order by start_time asc """
                .format(hit_group_id))
    data = hit_group_details_data_formater(dicts)
    params['date_from'] = hit_group.occurrence_date
    params['date_to'] = datetime.datetime.utcnow()
    params['data'] = data
    params['hit_group'] = hit_group
    return direct_to_template(request, 'main/hit_group_details.html', params)
Example #17
    def handle(self, **options):

        """
        Take ${limit} last crawls without spam classification
        Classify all hit groups, update hits_mv to have proper hit classification
        Rebuild crawl_aggregates for a given crawl
        Refresh memcache
        """

        service = get_prediction_service()

        pid = Pid('classify_spam', True)

        transaction.enter_transaction_management()
        transaction.managed(True)

        start_time = time.time()

        try:

            number_of_predictions = 0

            for c in list(Crawl.objects.filter(is_spam_computed=False).order_by('-id')[:options['limit']]):

                log.info("processing %s", c)

                spam = set([])
                not_spam = set([])

                updated = 0

                for row in query_to_dicts("""select content_id, group_id, is_spam from hits_mv
                    where
                        crawl_id = %s""", c.id):

                    log.info("classifying crawl_id: %s, %s", c.id, row)

                    if row['is_spam'] is None:

                        is_spam = None
                        content = HitGroupContent.objects.get(id=row['content_id'])

                        if content.is_spam is None:
                            data = content.prepare_for_prediction()

                            body = {'input': {'csvInstance': data}}
                            prediction = service.predict(body=body, data=options['file']).execute()

                            number_of_predictions += 1
                            updated += 1

                            content.is_spam = prediction['outputLabel'] != 'No'
                            content.save()

                        execute_sql("update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id']))
                        transaction.commit()

                        if content.is_spam:
                            log.info("detected spam for %s", row)
                            spam.add(str(row['content_id']))
                        else:
                            not_spam.add(str(row['content_id']))

                    else:
                        log.info("is_spam already computed for %s" % row)

                if updated > 0:
                    c.is_spam_computed = True
                    c.save()

                log.info("done classifying crawl")

                execute_sql("""UPDATE main_crawlagregates
                    set spam_projects =
                        ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                    where crawl_id = %s""" % (c.id, c.id))

                transaction.commit()

                log.info("done processing %s", c)

        except (KeyError, KeyboardInterrupt, HttpError), e:
            log.error(e)
            transaction.rollback()
            pid.remove_pid()
            exit()
Example #18
    def handle(self, **options):

        """
        take all crawls that have diffs computed from - inf till yesterday
        start from the oldest crawl and try to create daily stats
        if there are some crawls without diffs stop the process
        """

        '''
        take earliest crawl
        calculate inconsistencies in history
        calculate data for every missing element
        '''
        crawl = get_first_crawl()
        if not crawl:
            log.error("No crawls to process.")
            return

        transaction.enter_transaction_management()
        transaction.managed(True)
        created = 0

        start, end = (crawl.start_day(), datetime.date.today())
        days = (end - start).days
        log.info("Processing {0} to {1} ({2} days).".format(start, end, days))

        for i in range(0, days):

            day = crawl.start_day() + datetime.timedelta(days=i)
            day_end = day + datetime.timedelta(days=1)

            crawls = Crawl.objects.filter(has_diffs=False, start_time__gte=day,
                start_time__lt=day_end)
            if len(crawls) > 0:
                log.error("Not all crawls from %s have diffs." % day)
                continue

            try:
                DayStats.objects.get(date=day)
            except DayStats.DoesNotExist:
                log.info("Calculating stats for %s" % day)

                range_start_date = day.isoformat()
                range_end_date = (day_end).isoformat()

                log.info("Calculating arrivals.")
                '''
                stats for projects posted on particular day
                '''
                arrivals = query_to_dicts('''
                    select
                        sum(hits_diff) as "arrivals",
                        sum(hits_diff*reward) as "arrivals_value"
                    from
                        hits_mv p
                    where
                     start_time between TIMESTAMP '%s' and TIMESTAMP '%s'
                        and hits_diff > 0
                    ''' % (range_start_date, range_end_date)).next()
                if arrivals['arrivals'] == None:
                    log.info('Arrivals is None, skipping.')
                    continue

                log.info("Calculating processed.")
                processed = query_to_dicts('''
                    select
                        sum(hits_diff) as "processed",
                        sum(hits_diff*reward) as "processed_value"
                    from
                        hits_mv p
                    where
                     start_time between TIMESTAMP '%s' and TIMESTAMP '%s'
                        and hits_diff < 0
                    ''' % (range_start_date, range_end_date)).next()
                if processed['processed'] == None:
                    log.info('Processed is None, skipping.')
                    continue

                DayStats.objects.create(date=day,
                    arrivals=arrivals['arrivals'],
                    arrivals_value=arrivals['arrivals_value'],
                    processed=processed['processed'],
                    processed_value=processed['processed_value'])
                created += 1
                transaction.commit()

        log.info('Finish: {0} DayStat objects were created.'.format(created))
Example #19
def update_crawl_agregates(commit_threshold=1000, only_new=True):
    """Creates main_crawlagregates records for hits_mv."""

    def print_status(number, row_id):
        log.info('Committed after %s crawls, last id %s.' % (number, row_id))

    results = None

    query = """SELECT id FROM main_crawl p WHERE {0}
        groups_available * 0.9 < groups_downloaded AND
        NOT exists(SELECT id FROM main_crawlagregates WHERE crawl_id = p.id)
    """
    query = query.format('old_id is NULL AND' if only_new else '')
    results = query_to_dicts(query)

    log.info("Fetched crawls to process.")

    i = 0
    for i, row in enumerate(results):
        try:
            execute_sql("""
            INSERT INTO
                main_crawlagregates (hits, start_time, reward, crawl_id, id,
                    projects, spam_projects)
            SELECT
                sum(hits_available) as "hits",
                start_time,
                sum(reward * hits_available) as "reward",
                crawl_id,
                nextval('main_crawlagregates_id_seq'),
                count(*) as "count",
                0
            FROM
                (SELECT DISTINCT ON (group_id) * FROM hits_mv
                WHERE crawl_id = %s) AS p
            GROUP BY
                crawl_id, start_time
            """, row['id'])

            execute_sql("""UPDATE main_crawlagregates
                set spam_projects =
                    (select count(*) from hits_mv
                    where crawl_id = %s and is_spam = true)
                where crawl_id = %s""" % (row['id'], row['id']))

            if i % commit_threshold == 0:
                print_status(i, row['id'])
                execute_sql('commit;')

        except:
            error_info = grab_error(sys.exc_info())
            log.error('an error occurred at crawl_id: %s, %s %s' % (
                row['id'], error_info['type'], error_info['value']))
            execute_sql('rollback;')

    if i % commit_threshold != 0:
        print_status(i, row['id'])
        execute_sql('commit;')

    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;")
    execute_sql("COMMIT;")