def delete_crawl_agregates(self, ids):
     """This will be done separately, as it'a a much faster query with
     immediately visible effect.
     """
     qq = self.__get_delete_queries(['main_crawlagregates'], 'in').next()
     execute_sql(qq.format(self.__chunk_str(ids)))
     transaction.commit_unless_managed()
Exemplo n.º 2
0
def update_diffs(limit=100):
    start_time = time.time()
    execute_sql("""
        UPDATE hits_mv
            SET hits_diff = diffs.hits_diff
        FROM
            (SELECT
                group_id, crawl_id,
                    hits_available -
                    COALESCE(lag(hits_available) over (partition BY group_id ORDER BY crawl_id), 0)
                    AS hits_diff
            FROM hits_mv
            WHERE
                hits_diff is NULL
            AND crawl_id in
                (SELECT DISTINCT crawl_id
                FROM hits_mv
                WHERE hits_diff is NULL
                ORDER BY crawl_id DESC LIMIT %s)

            ) AS diffs

        WHERE (diffs.group_id = hits_mv.group_id) AND (diffs.crawl_id = hits_mv.crawl_id);""",
        (int(limit), ))
    execute_sql('commit;')

    log.debug('Updated diffs for %s crawls in %s\n%s', limit, (time.time() - start_time))
Exemplo n.º 3
0
def update_mviews(clear_existing=True, force=False, start=None, end=None):
    """Creates hits_mv records for crawls with enough groups_downloaded
    taking data from main_hitgroupstatus and main_hitgroupcontent tables.

    Filtering out incomplete anonymous crawls
    -----------------------------------------
    Note: this is covered by the latter case

    There is a 20 page limit for anonymous mturk.com users, each page having
    10 hitgroups, while usual number of hitgroups is at least 100.
    Thus all crawls with 200 or less crawls should be excluded.

    Filtering erronous crawls
    -------------------------
    Usually, the difference between hitgroups_downloaded and higroups_available
    exceeding 10 percent denotes a crawl error. Such crawls should be excluded
    from creating hits_mv and further table records.

    """
    for crawl_id, start_time in get_crawls_for_update(force, start, end):

        if clear_existing:
            log.info("Deleting hits_mv records for: {0}".format(crawl_id))
            execute_sql("DELETE FROM hits_mv WHERE crawl_id = {0}".format(
                crawl_id), commit=True)

        log.info("Creating hits_mv records for: {0}.".format(crawl_id))
        create_hits_mv_record(start_time, crawl_id)
Exemplo n.º 4
0
def update_first_occured_agregates():

    missing_crawls = query_to_tuples("""select id from main_crawl p where p.success = true and not exists (select crawl_id from main_hitgroupfirstoccurences where crawl_id = p.id );""")

    for row in missing_crawls:

        crawl_id = row[0]
        log.info("inserting missing crawl into main_hitgroupfirstoccurences: %s" % crawl_id)

        execute_sql("""INSERT INTO
                main_hitgroupfirstoccurences (reward, group_content_id,
                    crawl_id, requester_name, group_status_id, occurrence_date,
                    requester_id, group_id, hits_available, id)
                    select
                        p.reward,
                        p.id,
                        q.crawl_id,
                        p.requester_name,
                        q.id,
                        p.occurrence_date,
                        p.requester_id,
                        p.group_id,
                        q.hits_available,
                        nextval('main_hitgroupfirstoccurences_id_seq'::regclass)
                    from main_hitgroupcontent p join main_hitgroupstatus q
                        on( p.first_crawl_id = q.crawl_id and q.hit_group_content_id = p.id )
                        where q.crawl_id = %s;""" % crawl_id)

        execute_sql('commit;')
Exemplo n.º 5
0
def create_hits_mv_record(start_time, crawl_id):
    """Aggregates main_hitgroupstatus/main_hitgroupcontent data into hits_mv
    and marks crawl as already processed."""
    execute_sql("""INSERT INTO
            hits_mv (status_id, content_id, group_id, crawl_id, start_time,
                requester_id, hits_available, page_number, inpage_position,
                hit_expiration_date, reward, time_alloted, hits_diff,
                is_spam)
        SELECT p.id AS status_id, q.id AS content_id, p.group_id,
            p.crawl_id, TIMESTAMP '{start_time}',
            q.requester_id, p.hits_available, p.page_number,
            p.inpage_position, p.hit_expiration_date, q.reward,
            q.time_alloted, null, q.is_spam
        FROM
            main_hitgroupstatus p
        JOIN
            main_hitgroupcontent q ON (q.group_id::text = p.group_id::text
                                       AND p.hit_group_content_id = q.id)
        WHERE
            p.crawl_id = {crawl_id};
    """.format(start_time=start_time, crawl_id=crawl_id), commit=True)

    execute_sql(("UPDATE main_crawl SET has_hits_mv = true WHERE id = {0};"
        ).format(crawl_id), commit=True)

    transaction.commit_unless_managed()
Exemplo n.º 6
0
 def handle(self, *args, **options):
     def _to_hit_group_class(results):
         for result in results:
             doc = result['document']
             prob = result['probabilities']
             yield HitGroupClass(group_id=doc['group_id'],
                                 classes=NaiveBayesClassifier.most_likely(result),
                                 probabilities=json.dumps(prob))
     if options['clear_all']:
         logger.info('Removing all existing classification')
         # HitGroupClass.objects.all().delete()
         execute_sql('DELETE FROM main_hitgroupclass;', commit=True)
         return
     if options['begin'] and options['end']:
         # XXX it can be slow.
         query = ''' SELECT group_id, title, description, keywords
                     FROM main_hitgroupcontent as content
                     JOIN hits_mv 
                     ON content.group_id = hits_mv.group_id
                     WHERE 
                         NOT EXISTS(
                             SELECT * FROM main_hitgroupclass as class
                             WHERE content.group_id = class.group_id
                         ) AND 
                         hits_mv.start_time >= {} AND 
                         hits_mv.start_time < {} 
                     GROUP BY group_id
                     LIMIT {};
                 '''.format(options['begin'], options['end'], self.BATCH_SIZE)
     else:
         query = ''' SELECT group_id, title, description, keywords
                     FROM main_hitgroupcontent as content
                     WHERE NOT EXISTS(
                         SELECT * FROM main_hitgroupclass as class
                         WHERE content.group_id = class.group_id
                     ) LIMIT {};
                 '''.format(self.BATCH_SIZE)
     if not options['classifier_path']:
         try:
             options['classifier_path'] = settings.CLASSIFIER_PATH
         except AttributeError:
             raise ImproperlyConfigured('Classifier path is not specified '
                                        'neither in the settings file'
                                        'nor in the command line')
     with open(options['classifier_path'], 'r') as file:
         probabilities = json.load(file)
         classifier = NaiveBayesClassifier(probabilities=probabilities)
         logger.info('Classification of hit groups started. Processing in '\
                     'batches size of {}'.format(self.BATCH_SIZE))
         while True:
             models = query_to_dicts(query)
             logger.info('Batch classification started')
             try:
                 results = _to_hit_group_class(classifier.classify_batch(models))
                 HitGroupClass.objects.bulk_create(results)
             except EmptyBatchException:
                 logger.info('Batch is empty no hit groups to classify')
                 break
             logger.info('Batch classified successfully')
def clear_existing_rows(start, end):
    t = time.time()
    log.info("Deleting crawl agregates for period {0} to {1}".format(start, end))
    execute_sql(
        "DELETE FROM main_crawlagregates WHERE start_time BETWEEN '{0}'"
        " and '{1}'".format(start.isoformat(), end.isoformat()),
        commit=True,
    )
    log.info("Deleting crawl agregates for period {0} to {1} took {2}s.".format(start, end, t))
Exemplo n.º 8
0
def __create_extra_tables():
    """No automation here, just add lines for extra tables wanted."""
    for table_name, table_sql in EXTRA_TABLES.iteritems():
        path = os.path.join(SQL_PATH, table_sql)
        if table_exists(table_name):
            log.warning('Table {0} exists.'.format(table_name))
        else:
            execute_sql(open(path).read())
            log.info('Creating table {0}.'.format(table_name))
    transaction.commit_unless_managed()
    def update_interrupted_crawl_stats(self):
        """Checks for crawls that were interrupted before updating the number
        of downloaded hit groups.

        Currently those crawls can be found by checking for crawls having
        groups_downloaded = groups_available, since at the begining of the crawl
        those values are set to be equal.

        """
        execute_sql(self.recount_query)
        transaction.commit_unless_managed()
    def clear_past_results(self, start, end):
        """Removes records from hits_mv and main_crawlaggregates tables matching
        the given time period.

        """
        for table in ['hits_mv', 'main_crawlagregates']:
            st = time.time()
            self.log.info('Deleting rows from {0} where start_time between {1}'
                ' and {2}.'.format(table, start, end))
            q = "DELETE FROM {0} where start_time BETWEEN '{1}' AND '{2}';"
            execute_sql(q.format(table, start, end), commit=True)
            self.log.info('{0}s elapsed.'.format(time.time() - st))
    def update_interrupted_crawl_stats(self):
        """Checks for crawls that were interrupted before updating the number
        of downloaded hit groups.

        Currently those crawls can be found by checking for crawls having
        groups_downloaded = groups_available, since at the begining of the crawl
        those values are set to be equal.

        """
        log.info('--fix-interrupted specified, updating interrupted crawls to '
            'have groups_downloaded match the actual downloaded object count.')
        execute_sql(self.recount_query)
        transaction.commit_unless_managed()
Exemplo n.º 12
0
def update_mviews():
    """Creates hits_mv records for crawls with enough groups_downloaded
    taking data from main_hitgroupstatus and main_hitgroupcontent tables.

    Filtering out incomplete anonymous crawls
    -----------------------------------------
    Note: this is covered by the latter case

    There is a 20 page limit for anonymous mturk.com users, each page having
    10 hitgroups, while usual number of hitgroups is at least 100.
    Thus all crawls with 200 or less crawls should be excluded.

    Filtering erronous crawls
    -------------------------
    Usually, the difference between hitgroups_downloaded and higroups_available
    exceeding 10 percent denotes a crawl error. Such crawls should be excluded
    from creating hits_mv and further table records.

    """
    query = """
    select id, start_time from main_crawl p
    where
        p.success = true and old_id is null and has_hits_mv = false and
        not exists (select id from main_crawlagregates where crawl_id = p.id)
        and p.groups_available * 0.9 < p.groups_downloaded
    order by id desc"""

    missing_crawls = query_to_tuples(query)

    for row in missing_crawls:

        crawl_id, start_time = row

        log.info("inserting missing crawl: %s" % crawl_id)

        execute_sql("delete from hits_mv where crawl_id = %s" % crawl_id)

        execute_sql("""INSERT INTO
                hits_mv (status_id, content_id, group_id, crawl_id, start_time,
                    requester_id, hits_available, page_number, inpage_position,
                    hit_expiration_date, reward, time_alloted, hits_diff,
                    is_spam)
            SELECT p.id AS status_id, q.id AS content_id, p.group_id,
                p.crawl_id, TIMESTAMP '%s',
                q.requester_id, p.hits_available, p.page_number,
                p.inpage_position, p.hit_expiration_date, q.reward,
                q.time_alloted, null, q.is_spam
            FROM
                main_hitgroupstatus p
            JOIN
                main_hitgroupcontent q ON (q.group_id::text = p.group_id::text
                                           AND p.hit_group_content_id = q.id)
            WHERE
                p.crawl_id = %s;
        """ % (start_time, crawl_id))

        execute_sql(("update main_crawl set has_hits_mv = true where"
            " id = %s") % crawl_id)

        execute_sql('commit;')
Exemplo n.º 13
0
def updatehitgroup(g, cid):
    """Updates hits_mv.hits_available by subtracting current group's
    hits_available.

    """

    prev = execute_sql("""select hits_available from hits_mv
                where
                    crawl_id between %s and %s and
                    group_id = '%s'
                order by crawl_id desc
                limit 1;""" % (cid - 100, cid - 1, g)).fetchall()
    prev = prev[0][0] if prev else 0

    execute_sql("""update hits_mv set hits_diff = hits_available - %s where
            group_id = '%s' and crawl_id = %s;""" % (prev, g, cid))
    def do_deletes_chunked(self, ids):
        """More complex version, does multiple crawls at a time."""
        processed = 0
        qs = list(self.__get_delete_queries(['hits_mv', 'hits_temp'], 'in'))
        for chunk in self.read_chunks(
                ids, limit=self.limit, chunk_size=self.chunk_size):
            chunk_str = self.__chunk_str(chunk)
            for q in qs:
                execute_sql(q.format(chunk_str))
            execute_sql(("update main_crawl set has_hits_mv = false where"
                " id in {0}").format(chunk_str))
            processed += len(chunk)
            log.info('Processed crawls: {0}, {1}/{2} in {3}s.'.format(
                chunk, processed, self.crawl_count, self.time_elapsed()))
            transaction.commit_unless_managed()

        return processed
Exemplo n.º 15
0
    def clear_past_results(self):
        """Clears results in hits_mv and main_crawlagregates tables.

        Tables cleared:
        hits_mv -- hits_posted and hits_consumed
        main_crawlagregates -- hits_consumed, hits_posted, hitgroups_posted,
        hitgroups_consumed

        """
        self.log.info("Clearing existiting hits_mv columns.")
        clear_time = time.time()
        execute_sql(
            """UPDATE hits_mv SET hits_posted = 0, hits_consumed = 0
        WHERE
            crawl_id IN (
                SELECT id FROM main_crawl
                WHERE start_time BETWEEN '{0}' AND '{1}'
            ) AND
            hits_posted > 0 OR hits_consumed > 0;
        """.format(
                self.start.isoformat(), self.end.isoformat()
            ),
            commit=True,
        ).close()
        self.log.info("{0}s elapsed.".format(time.time() - clear_time))

        self.log.info("Clearing existiting main_crawlagregates columns.")
        clear_time = time.time()
        execute_sql(
            """UPDATE main_crawlagregates
        SET
            hits_posted = 0, hits_consumed = 0,
            hitgroups_posted = 0, hitgroups_consumed = 0
        WHERE
            crawl_id IN (
                SELECT id FROM main_crawl
                WHERE start_time BETWEEN '{0}' AND '{1}'
            ) AND (
            hits_posted > 0 OR hits_consumed > 0 OR
            hitgroups_consumed > 0 OR hitgroups_posted > 0);
        """.format(
                self.start.isoformat(), self.end.isoformat()
            ),
            commit=True,
        ).close()
        self.log.info("{0}s elapsed.".format(time.time() - clear_time))
    def do_deletes_simple(self, ids):
        """Performs a query per crawl and per table."""
        qs = list(self.__get_delete_queries(['hits_mv', 'hits_temp'], '='))
        for i, crawl_id in enumerate(ids, start=1):
            if self.limit and i > self.limit:
                break
            for q in qs:
                execute_sql(q.format(crawl_id))
            if i % 10 == 0:
                log.info(("{0}/{1} crawls processed, {2}s elapsed so far."
                    ).format(i, self.crawl_count, self.time_elapsed()))
                transaction.commit_unless_managed()
            execute_sql(("update main_crawl set has_hits_mv = false where"
                " id = {0}").format(crawl_id))

        transaction.commit_unless_managed()
        return i
    def handle(self, **options):

        update_crawl_agregates(only_new=False)

        f = open(os.path.join(settings.ROOT_PATH, 'crawl.errors.csv'), "rb")
        progress = 10

        execute_sql("update main_crawl set success = true where old_id is not null")

        for i, id in enumerate(f):
            id = id.strip()
            execute_sql("delete from main_crawlagregates where crawl_id = (select id from main_crawl where old_id = %s)" % id)
            execute_sql("update main_crawl set success = false where old_id = %s" % id)

            if i % progress == 0:
                print "processed %s rows" % i
                execute_sql("commit;")

        execute_sql("commit;")
Exemplo n.º 18
0
def update_cid(cid):
    """Updates hits_diff on hits_mv record related to crawl with id equal to
    cid.

    """
    st = time.time()
    count = 0
    for i, g in enumerate(query_to_tuples("select distinct group_id from hits_mv where crawl_id = %s", cid)):
        g = g[0]
        if i == 0:
            log.info("processing %s, %s %s", i, cid,  g)

        updatehitgroup(g, cid)
        count += 1

    execute_sql("commit;")

    log.info("Updated crawl {0} in {1}. {2} groups processed".format(
        cid, time.time() - st, count))

    return count
Exemplo n.º 19
0
    def handle(self, **options):

        try:

            commit_threshold = 10
            results = query_to_dicts("select id from main_crawl p where not exists(select id from hits_mv where crawl_id = p.id)")

            for i, row in enumerate(results):

                execute_sql("""insert into hits_mv
    SELECT p.id AS status_id, q.id AS content_id, p.group_id, p.crawl_id, ( SELECT main_crawl.start_time
               FROM main_crawl
              WHERE main_crawl.id = p.crawl_id) AS start_time, q.requester_id, p.hits_available, p.page_number, p.inpage_position, p.hit_expiration_date, q.reward, q.time_alloted
       FROM main_hitgroupstatus p
       JOIN main_hitgroupcontent q ON q.group_id::text = p.group_id::text AND p.hit_group_content_id = q.id
      WHERE p.crawl_id = %s
                """ % row['id'])

                if i % commit_threshold == 0:
                    print datetime.datetime.now(), 'commited after %s crawls' % i
                    execute_sql('commit;')

        except:
            error_info = grab_error(sys.exc_info())
            print 'an error occured at: %s line, %s %s' % (i, error_info['type'], error_info['value'])
            execute_sql('rollback;')
Exemplo n.º 20
0
def __create_procedures():
    """Executes create or replace for each procedure mentioned in
    PROCEDURES_TO_CREATE.

    Each procedure is defined by a filename and a procedure creating function.
    Filename with extension stripped will be the new procedure's name and the
    file's content it's body.

    The create function should take procedure's name and the content of it's
    file. Simplest such function is create_no_args which simply wraps procedure
    code in a correct CREATE OR REPLACE FUNCTION statement.

    """
    sqls = []
    for prfile, method in PROCEDURES_TO_CREATE.iteritems():
        prname = os.path.splitext(os.path.basename(prfile))[0]
        sqlpath = os.path.join(SQL_PATH, prfile)
        log.info('Creating procedure {prname} from file: {sqlpath}'.format(
            prname=prname, sqlpath=sqlpath))
        sqls.append(method(prname, open(sqlpath).read()))
    execute_sql('\n'.join(sqls))
    transaction.commit_unless_managed()
Exemplo n.º 21
0
def topreq_data_hits_posted_crawl_id(days):
    """Evaluates toprequesters by the number of hits posted in a period.

    Keyword arguments:
    days -- number of days to look back

    """
    # select the first crawl considered by start_time
    firstcrawl = execute_sql("""
        SELECT crawl_id
        FROM hits_mv
        WHERE
            start_time > %s
        ORDER BY start_time ASC
        LIMIT 1;""", datetime.date.today() - datetime.timedelta(int(days))
        ).fetchall()[0][0]

    # We are only interested in records having hits_posted > 0, thus only such
    # records will appear on the list and max(start_time) should be available at
    # all times.
    return list(query_to_tuples("""
        SELECT
            h.requester_id,
            h.requester_name,
            coalesce(count(distinct mv.group_id), 0) as "projects",
            coalesce(sum(mv.hits_posted), 0) as "hits",
            coalesce(sum(mv.hits_posted * h.reward), 0) as "reward",
            max(mv.start_time) as "last_posted"
        FROM
            main_hitgroupcontent h
            LEFT JOIN main_requesterprofile p
                ON h.requester_id = p.requester_id
            LEFT JOIN (
                SELECT group_id, hits_posted, start_time
                FROM hits_mv
                WHERE
                    crawl_id > {0} AND
                    hits_posted > 0
            ) mv
                ON h.group_id = mv.group_id
            WHERE
                coalesce(p.is_public, true) = true
            GROUP BY h.requester_id, h.requester_name
            ORDER BY reward desc;""".format(firstcrawl)))
def update_crawl_agregates(commit_threshold=1000, start=None, end=None, clear_existing=False):
    """Creates main_crawlagregates records for hits_mv."""

    def print_status(number, row_id):
        log.info("Commited after %s crawls, last id %s." % (number, row_id))

    clear_existing and start and end and clear_existing_rows(start, end)

    i = 0
    for i, row in enumerate(get_crawls(start=start, end=end)):
        try:
            execute_sql(
                """
            INSERT INTO
                main_crawlagregates (hits, start_time, reward, crawl_id, id,
                    projects, spam_projects)
            SELECT
                sum(hits_available) as "hits",
                start_time,
                sum(reward * hits_available) as "reward",
                crawl_id,
                nextval('main_crawlagregates_id_seq'),
                count(*) as "count",
                count(CASE WHEN is_spam = TRUE then TRUE ELSE NULL END)
            FROM
                (SELECT DISTINCT ON (group_id) * FROM hits_mv
                WHERE crawl_id = %s) AS p
            GROUP BY
                crawl_id, start_time
            """,
                row["id"],
            )

            if i % commit_threshold == 0:
                print_status(i + 1, row["id"])
                transaction.commit_unless_managed()

        except:
            error_info = grab_error(sys.exc_info())
            log.error("an error occured at crawl_id: %s, %s %s" % (row["id"], error_info["type"], error_info["value"]))
            execute_sql("rollback;")

    if i % commit_threshold != 0:
        print_status(i + 1, row["id"])
        transaction.commit_unless_managed()

    # delete dummy data
    execute_sql("DELETE FROM main_crawlagregates WHERE projects < 200;", commit=True)
    transaction.commit_unless_managed()
Exemplo n.º 23
0
def calculate_first_crawl_id():

    progress = 10
    results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null")
    log.info('got missing ids results')
    for i, r in enumerate(results):
        log.info("\tprocessing %s" % r['id'])
        execute_sql("""update main_hitgroupcontent p set first_crawl_id =
            (select min(crawl_id) from main_hitgroupstatus where hit_group_content_id = p.id)
            where
                id = %s
        """ % r['id'])

        if i % progress == 0:
            execute_sql('commit;')
            log.info("updated %s main_hitgroupcontent rows with first_crawl_id" % i)

    execute_sql('commit;')
    def handle(self, **options):
        try:
            commit_threshold = 1000
            results = query_to_dicts("select id from main_hitgroupcontent where first_crawl_id is null")

            for i, row in enumerate(results):
                execute_sql("""update main_hitgroupcontent p set first_crawl_id =
                    (select crawl_id from main_hitgroupstatus where group_id = p.group_id order by crawl_id asc LIMIT 1)
                    where
                        id = %s
                """ % row['id'])

                if i % commit_threshold == 0:
                    print datetime.datetime.now(), 'commited after %s rows' % i
                    execute_sql('commit;')
        except:
            error_info = grab_error(sys.exc_info())
            print 'an error occured at: %s line, %s %s' % (i, error_info['type'], error_info['value'])
            execute_sql('rollback;')
Exemplo n.º 25
0
def clean_duplicates():

    ids = query_to_dicts("select group_id from main_hitgroupcontent group by group_id having count(*) > 1;")

    for id in ids:

        log.info("Deleting duplicate group %s" % id['group_id'])

        execute_sql("""delete from main_hitgroupstatus where
                        hit_group_content_id  in (
                            select id from main_hitgroupcontent where id !=
                                (select min(id) from main_hitgroupcontent where group_id = '%s')
                        and group_id = '%s');
        """ % (id['group_id'], id['group_id']))

        execute_sql("""delete from main_hitgroupcontent where
                        id != (select min(id) from main_hitgroupcontent where group_id = '%s') and group_id = '%s'
                    """ % (id['group_id'], id['group_id']))

    execute_sql('commit;')
Exemplo n.º 26
0
    def handle(self, *args, **options):

        if options["clear_all"]:
            logger.info("Removing all existing classification aggregates")
            # HitGroupClassAggregate.objects.all().delete()
            execute_sql('DELETE FROM main_hitgroupclassaggregate;', commit=True)
            return

        begin = options["begin"]
        end = options["end"]

        # If the end of the time interval is not specified compute aggregates
        # up to now.
        if end is None:
            end = datetime.now()
        else:
            end = parse(end)

        # If the begining of the time interval is not specified compute
        # from last existing aggregates.
        if begin is None:
            aggregate = HitGroupClassAggregate.objects.aggregate(Max('start_time'))
            start_time = aggregate['start_time__max']
            if start_time is None:
                aggregate = Crawl.objects.aggregate(Min('start_time'))
                start_time = aggregate['start_time__min'] + timedelta(seconds=1)
            else:
                start_time += timedelta(seconds=1)
            if start_time is not None:
                begin = start_time
            else:
                logger.warn("No crawls for processing")
                return
        else:
            begin = parse(begin)

        begin = begin.replace(tzinfo=None)
        end = end.replace(tzinfo=None)

        queryset = HitGroupClassAggregate.objects.filter(start_time__gte=begin,
                                                         start_time__lt=end)
        if options["clear_existing"]:
            queryset.delete()

        if queryset.exists():
            logger.error("Some of aggreagates are already calucated for a "
                         "given time interval")
            return

        logger.info("Calculating hit group class aggregates from {} to {} "
                    "with 1-hour intervals"
                    .format(begin, end, ))

        chunk_begin = begin
        chunk_end = begin + timedelta(hours=1)

        query_template = """
                INSERT INTO
                    main_hitgroupclassaggregate
                    (
                        crawl_id,
                        start_time,
                        classes,
                        hits_available
                    )
                SELECT
                    crawl_id,
                    start_time,
                    classes,
                    sum(hits_available) as hits_available
                FROM hits_mv
                JOIN main_hitgroupclass
                ON hits_mv.group_id = main_hitgroupclass.group_id
                WHERE
                    start_time >= '{}' AND
                    start_time < '{}'
                GROUP BY
                    crawl_id,
                    start_time,
                    classes;
            """
        while (chunk_end < end):
            logger.info("Processing chunk of crawls from {} to {}"
                        .format(chunk_begin, chunk_end))
            query = query_template.format(chunk_begin, chunk_end)
            execute_sql(query, commit=True)
            chunk_begin = chunk_end
            chunk_end += timedelta(hours=1)

        if chunk_begin < end:
            query = query_template.format(chunk_begin, end)
            execute_sql(query, commit=True)

        logger.info("Hit group class aggregates are made")
Exemplo n.º 27
0
    def handle(self, **options):
        '''
        Sample data:
groupid | hit_title | requester_name | requester_id | description | keywords | qualifications | hit_expiration_date | time_allotted | reward | hits_available | time_crawled | crawl_id | pageNo | inpage_position | dollars
CW18RQ8BPZWWMWY3DY4Z | Validate the addresses. Great bonus  | Product Search | A28JEQTWH76JDT | Given the list of addresses and it's site URL, please verify whether the given addresses exists in the site or not. [ Note: Bonus and Reward Details: 1). For each validated address you will get $0.02 as bonus. Suppose if the input file contains 10 addresses and you have validated all of them then you will get $0.2(i.e., 10*0.02) as bonus + $0.01 as reward. ] | reward,  online,  shopping,  web,  quality,  testing,  relevance,  search,  engine,  US,  India,  browse,  hit  | HIT approval rate (%) is not less than 80  | Aug 26, 2009  (33 weeks)  | 1 day 12 hours | $0.01 | 295 | 2009-01-07 14:00:05 | 1 | 2 | 5 | 0.01

HitGroupContent:
    group_id            = models.CharField('Group ID', max_length=50, db_index=True, unique=True)
    group_id_hashed     = models.BooleanField(default=False)
    requester_id        = models.CharField('Requester ID', max_length=50, db_index=True)
    requester_name      = models.CharField('Requester Name', max_length=500)
    reward              = models.FloatField('Reward')
    html                = models.TextField('HTML', max_length=100000000)
    description         = models.TextField('Description', max_length=1000000)
    title               = models.CharField('Title', max_length=500)
    keywords            = models.CharField('Keywords', blank=True, max_length=500, null=True)
    qualifications      = models.CharField('Qualifications', blank=True, max_length=500, null=True)
    occurrence_date     = models.DateTimeField('First occurrence date', blank=True, null=True, db_index=True)
    time_alloted        = models.IntegerField('Time alloted')
    first_crawl         = models.ForeignKey(Crawl, blank=True, null=True)

HitGroupStatus

    group_id            = models.CharField('Group ID',max_length=50, db_index=True)
    hits_available      = models.IntegerField('Hits Avaliable')
    page_number         = models.IntegerField('Page Number')
    inpage_position     = models.IntegerField('In Page Position')
    hit_expiration_date = models.DateTimeField('Hit expiration Date')

    hit_group_content   = models.ForeignKey(HitGroupContent)

    crawl               = models.ForeignKey(Crawl)

    '''

        items_per_transaction = 1000
        transaction_count = 0
        i = 0
        hit_group_content_mapping = {}
        crawl_mapping = {}

        print 'setting up crawl mappings'
        crawls = Crawl.objects.all().values_list('old_id','pk')
        for row in crawls:
            crawl_mapping[row[0]] = row[1]

        del crawls

        try:
            i = get_counter('import_hitgroups_line')
        except:
            pass

        try:
            f = open(os.path.join(settings.ROOT_PATH,'data','hits.utf8.csv'),"rb")
            error_log = open(os.path.join(settings.ROOT_PATH,'data','error.hits.utf8.csv'),'w')

            '''
            seek to file_position stored in counter
            '''
            try:
                last_position = get_counter('import_hitgroups')
                print 'starting from: %s' % last_position
                f.seek(int(last_position))
            except:
                f.readline()
                print 'coulnd not find last position starting from first line'
                pass

            transaction.enter_transaction_management()
            transaction.managed(True)

            for row in f:
                try:
                    row = row.strip()
                    group_id, title, requster_name, requester_id, description, keywords, qualifications, hit_expiration_date, time_alloted, reward, hits_available, time_crawled, crawl_id, page_no, inpage_position, dollars =  tuple(row.split('|'))

                    '''
                    check if there already is a HitGroupContent for this row
                    if HitGroupContent exists do nothin
                    '''

                    reward = float(reward[1:]) # stripiing starting $ ex. $0.1
                    time_alloted = parse_time_alloted(time_alloted) # parsing strings like 4 hours 30 minutes to int minutes
                    crawl_id = int(crawl_id)
                    hits_available = int(hits_available)
                    page_no = int(page_no)
                    inpage_position = int(inpage_position)
                    hashed_group_id = False

                    if group_id == '':
                        group_id = hashlib.md5("%s;%s;%s;%s;%s;%s;%s;" % (title, requester_id,
                                                                         time_alloted,reward,
                                                                         description,keywords,
                                                                         qualifications)).hexdigest()
                        hashed_group_id = True


                    hit_expiration_date = datetime.datetime.strptime(re.sub('\(.*\)', '', hit_expiration_date).strip(), "%b %d, %Y") # Apr 5, 2009  (4 weeks 1 day)

                    exists = False
                    content_id = execute_sql("select id from main_hitgroupcontent where group_id = '%s'" % group_id).fetchone()
                    if content_id is not None:
                        hit_group_content_mapping[group_id] = content_id[0]
                        exists = True

                    if not exists:
                        '''
                        if not: save new HitGroupContent object store mapping in memmory
                        '''
                        obj = HitGroupContent(group_id_hashed = hashed_group_id, group_id=group_id, requester_id = requester_id, requester_name = requster_name, reward = reward, description = description, title = title, keywords = keywords, qualifications = qualifications, time_alloted = time_alloted, occurrence_date = time_crawled )
                        obj.save()
                        hit_group_content_mapping[group_id] = obj.pk

                    '''
                    store hitgroupstatus into db with correct mapping to HitGroupContent
                    '''
                    obj = HitGroupStatus(group_id = group_id, hits_available = hits_available, page_number = page_no, inpage_position = inpage_position, hit_expiration_date = hit_expiration_date, hit_group_content_id = hit_group_content_mapping[group_id], crawl_id = crawl_mapping[crawl_id])
                    obj.save()

                except (ValueError, KeyError):
                    error_info = grab_error(sys.exc_info())
                    error_log.write(row)
                    error_log.write("\r\n")
                    print 'an error occured at: %s line, %s %s' % (i,error_info['type'], error_info['value'])

                i += 1

                '''
                do a transaction per items_per_transaction rows
                when commiting transaction write file position and next crawl_id to counter file
                '''
                if i % items_per_transaction == 0:
                    transaction.commit()
                    transaction_count += 1
                    write_counter('import_hitgroups', f.tell())
                    write_counter('import_hitgroups_line', i)
                    print 'commited %s transaction, line: %s' % (transaction_count, i)

        except KeyboardInterrupt:
            '''
            User stopped script, rollback last data, close file descriptors  exit

            '''
            transaction.rollback()
            error_log.close()
            f.close()
            exit()
Exemplo n.º 28
0
def last_crawlids(limit=10):
    r = execute_sql("select crawl_id from hits_mv order by crawl_id desc limit %s", limit)
    return [c[0] for c in r.fetchall()]
Exemplo n.º 29
0
def last_crawlid():
    return execute_sql("select crawl_id from hits_mv order by crawl_id desc limit 1;").fetchall()[0][0]
Exemplo n.º 30
0
def hitgroups(cid):
    r = execute_sql("select distinct group_id from hits_mv where crawl_id = %s", cid)
    return [g[0] for g in r.fetchall()]