def handle(self, **options):

        self.start_time = time.time()

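        # Pid file guard -- presumably prevents concurrent runs of this command.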
        pid = Pid('remove_bad_crawl_related', True)

        self.having_hits_mv = not options.get('all')
        self.chunk_size = options.get('chunk-size')
        self.chunked = not options.get('simple')
        self.limit = options.get('limit')
        self.fix_interrupted = options.get('fix-interrupted')

        self.crawl_count = self.get_crawls_count()
        if options.get('count-only') or self.crawl_count == 0:
            self.handle_count_only()

        if self.fix_interrupted:
            self.update_interrupted_crawl_stats()

        # if limit is specified, show X/Y instead of just Y
        log.info('Starting bad crawl related data removal, {0}{1} records will '
            'be processed.'.format(
                '{0}/'.format(self.limit) if self.limit else '',
                self.crawl_count))

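        # Collect the ids of the crawls to process and delete their related records.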
        ids = self.get_crawl_ids()
        deleted = self.do_deletes(ids)

        log.info('Command took: {0}, {1} crawls processed.'.format(
            self.time_elapsed(), deleted))

        pid.remove_pid()
    def handle(self, **options):

        self.process_options(options)

        pid = Pid(self.pidfile, True)
        start_time = time.time()

        try:

            cur = connection.cursor()

            self.logger.info('Calling {0}({1}, {2}), start time: {3}.'.format(
                self.proc_name, self.start, self.end, now()))

            cur.callproc(self.proc_name, self.get_proc_args())

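            # Commit right away unless the transaction is managed by the caller.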
            transaction.commit_unless_managed()

            self.logger.info('{0} for crawls from {1} to {2} took: {3}.'.format(
                self.proc_name, self.start, self.end, time.time() - start_time))

        except Exception as e:
            self.logger.exception(e)
        finally:
            pid.remove_pid()
            if options.get('verbosity') == 0:
                self.logger.setLevel(logging.DEBUG)
    def handle(self, **options):

        pid = Pid('mturk_agregates', True)
        start_time = time.time()

        log.info('Updating crawl aggregates')
        update_crawl_agregates(only_new=True)

        log.info('db_update_agregates took: %s' % (time.time() - start_time))

        pid.remove_pid()
    def handle(self, **options):

        pid = Pid("mturk_aggregates", True)

        self.process_options(options)
        start_time = time.time()

        log.info("Updating crawl agregates")
        update_crawl_agregates(start=self.start, end=self.end, clear_existing=self.clear_existing)
        log.info("db_update_agregates took: %s" % (time.time() - start_time))

        pid.remove_pid()
    def handle(self, **options):
        """Main command entry point."""

        self.options = options

        if self.options['list']:
            pass  # do nothing, go straight to print_status
        else:
            pid = Pid(self.options.get('pidfile'), True)
            self.prepare_options()  # sets self.reports and prints errors if any
            if self.reports:
                self.handle_purge() or self.handle_cache()
            pid.remove_pid()

        self.print_status()
    def handle(self, **options):

        pid = Pid('mturk_aggregates')

        self.process_options(options)
        start_time = time.time()

        log.info('Refreshing hits_mv')
        update_mviews(clear_existing=self.clear_existing, force=self.force,
            start=self.start, end=self.end)
        log.info('Done refreshing hits_mv db_refresh_mviews took: {0}s.'.format(
            time.time() - start_time))

        pid.remove_pid()
    def handle(self, **options):

        pid = Pid('mturk_crawler', True)

        start_time = time.time()

        log.info('Removing duplicate hitgroupcontent and hitgroupstatuses.')
        clean_duplicates()

        log.info('Refreshing hits_mv')
        update_mviews()

        log.info('Done refreshing hits_mv')

        log.info('db_refresh_mviews took: %s' % (time.time() - start_time))

        pid.remove_pid()
    def handle(self, **options):

        pid = Pid('mturk_cache_topreq', True)

        report_type = options.get('report-type')
        if report_type not in ToprequestersReport.values:
            log.info('Unknown report type: "{0}".'.format(report_type))
            return

        key = ToprequestersReport.get_cache_key(report_type)
        display_name = ToprequestersReport.display_names[report_type]

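        # Recalculate only if the report is missing from cache or --force was passed.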
        if cache.get(key) is None:
            log.info(('"{0}" toprequesters report missing, recalculating.'
                ).format(display_name))
        else:
            if options['force']:
                log.info('Recalculating "{0}" toprequesters report.'.format(
                    display_name))
            else:
                log.info('"{0}" toprequesters still in cache, use --force flag'
                    ' to rebuild anyway.'.format(display_name))
                return

        days = options['days']
        # not in cache, perform the query:
        start_time = time.time()
        data = ToprequestersReport.REPORT_FUNCTION[report_type](days)
        log.info('Toprequesters report "{0}" generated in: {1}s.'.format(
            display_name, time.time() - start_time))

        # verify the write -- caching failures are otherwise silent
        if not data:
            log.warning('Data returned by report function is {0}!'.format(data))
        else:
            cache.set(key, data, HOURS4)
            in_cache = cache.get(key)
            if in_cache is None:
                log.warning('Cache error - data could not be fetched!')

        pid.remove_pid()
    def handle(self, **options):

        pid = Pid('mturk_diffs', True)

        transaction.enter_transaction_management()
        transaction.managed(True)

        start_time = time.time()

        try:

            items = Crawl.objects.filter(is_spam_computed=False
                ).order_by('-id')[:options['limit']]
            lenitems = len(items)

            log.info(('Starting db_update_diffs, {0} crawls will be updated.'
                ).format(lenitems))

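            # Recompute diffs for each crawl and flag crawls that produced any.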
            for c in items:

                updated = update_cid(c.id)

                if updated > 0:
                    c.has_diffs = True
                    c.save()

                transaction.commit()

        except (KeyError, KeyboardInterrupt) as e:
            log.info(('Exception, rolling back the transaction and exiting: {0}'
                ).format(e))
            transaction.rollback()
            pid.remove_pid()
            exit()

        log.info('Success! Updating {0} crawls took: {1} s'.format(
            lenitems, time.time() - start_time))
    def handle(self, *args, **options):

        self.mturk_email = getattr(settings, 'MTURK_AUTH_EMAIL', None)
        self.mturk_password = getattr(settings, 'MTURK_AUTH_PASSWORD', None)

        _start_time = time.time()
        pid = Pid('mturk_crawler', True)
        log.info('crawler started: %s;;%s', args, options)

        if options.get('mturk_email'):
            self.mturk_email = options['mturk_email']
        if options.get('mturk_password'):
            self.mturk_password = options['mturk_password']

        if options.get('logconf', None):
            self.setup_logging(options['logconf'])

        if options.get('debug', False):
            self.setup_debug()
            print 'Current process pid: %s' % pid.actual_pid
            print ('To debug, type: python -c "import os,signal; '
                'os.kill(%s, signal.SIGUSR1)"\n') % pid.actual_pid

        self.maxworkers = options['workers']
        if self.maxworkers > 9:
            # If you want to remove this limit, don't forget to raise the
            # maximum number of connections of the dbpool object. Each worker
            # fetches 10 hit groups and spawns a separate task for every one
            # of them, each getting its own connection, so 9 workers already
            # require 9x10 = 90 connections.
            #
            # Also, with too many workers amazon stops returning valid data,
            # and retrying takes much longer than using a smaller number of
            # workers.
            sys.exit('Too many workers (more than 9). Quit.')
        start_time = datetime.datetime.now()

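        # Total numbers of HITs and HIT groups currently listed on mturk.com.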
        hits_available = tasks.hits_mainpage_total()
        groups_available = tasks.hits_groups_total()

        # create crawl object that will be filled with data later
        crawl = Crawl.objects.create(
                start_time=start_time,
                end_time=start_time,
                success=True,
                hits_available=hits_available,
                hits_downloaded=0,
                groups_available=groups_available,
                groups_downloaded=groups_available)
        log.debug('fresh crawl object created: %s', crawl.id)

        # fetch requester profiles so we can decide whether their hit groups
        # are public or not
        reqesters = RequesterProfile.objects.all_as_dict()

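        # Connection pool sized for the 9-worker limit above
        # (9 workers x 10 hit groups = 90 connections).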
        dbpool = ThreadedConnectionPool(10, 90,
            'dbname=%s user=%s password=%s' % (
                settings.DATABASES['default']['NAME'],
                settings.DATABASES['default']['USER'],
                settings.DATABASES['default']['PASSWORD']))
        # collection of group_ids that were already processed - this should
        # protect us from duplicating data
        processed_groups = set()
        total_reward = 0
        hitgroups_iter = self.hits_iter()

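        # Process hit groups in packs, spawning one greenlet per group.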
        for hg_pack in hitgroups_iter:
            jobs = []
            for hg in hg_pack:
                if hg['group_id'] in processed_groups:
                    log.debug('Group already in processed_groups, skipping.')
                    continue
                processed_groups.add(hg['group_id'])

                j = gevent.spawn(tasks.process_group,
                        hg, crawl.id, reqesters, processed_groups, dbpool)
                jobs.append(j)
                total_reward += hg['reward'] * hg['hits_available']
            log.debug('processing pack of hitgroups objects')
            gevent.joinall(
                jobs, timeout=settings.CRAWLER_GROUP_PROCESSING_TIMEOUT)
            # check if all jobs ended successfully
            for job in jobs:
                if not job.ready():
                    log.error('Killing job: %s', job)
                    job.kill()

            if len(processed_groups) >= groups_available:
                log.info('Skipping empty groups.')
                # there's no need to iterate over empty groups.. break
                break

            # amazon does not like too many requests at once, so give them a
            # quick rest...
            gevent.sleep(1)

        dbpool.closeall()

        # update crawler object
        crawl.groups_downloaded = len(processed_groups)
        crawl.end_time = datetime.datetime.now()
        crawl.save()

        work_time = time.time() - _start_time
        log.info("""Crawl finished:
        created crawl id: {crawl_id}
        total reward value: {total_reward}
        hits groups downloaded: {processed_groups}
        hits groups available: {groups_available}
        work time: {work_time:.2f} seconds
        """.format(crawl_id=crawl.id, total_reward=total_reward,
            processed_groups=len(processed_groups),
            groups_available=groups_available,
            work_time=work_time))

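        # Post-crawl sanity checks: warn if the crawl took too long or
        # downloaded too few of the available groups.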
        crawl_downloaded_pc = settings.INCOMPLETE_CRAWL_THRESHOLD
        crawl_warning_pc = settings.INCOMPLETE_CRAWL_WARNING_THRESHOLD
        crawl_time_warning = settings.CRAWLER_TIME_WARNING
        downloaded_pc = float(crawl.groups_downloaded) / groups_available
        if work_time > crawl_time_warning:
            log.warning(("Crawl took {0}s which seems a bit too long (more "
                "than {1}s), you might consider checking if correct mturk "
                "account is used, ignore this if high number of groups is "
                "experienced.").format(work_time, crawl_time_warning))
        if downloaded_pc < crawl_warning_pc:
            log.warning(('Only {0}% of hit groups were downloaded, which is '
                'below the {1}% warning threshold. Please check the mturk '
                'account configuration and/or look for network-related '
                'problems.').format(downloaded_pc, crawl_warning_pc))
        if downloaded_pc < crawl_downloaded_pc:
            log.warning("This crawl contains far too few groups downloaded to "
                "available: {0}% < {1}% downloaded threshold and will be "
                "considered as erroneous ({2}/{3} groups).".format(
                    downloaded_pc, crawl_downloaded_pc,
                    crawl.groups_downloaded, groups_available))

        pid.remove_pid()
    def run(self):

        pid = Pid('mturk_crawler', True)

        log.info('Crawler started')

        start_time = datetime.datetime.now()

        #Fetching statistical information about groups and HITs count
        log.debug("Fetching stats")
        main_response = urllib2.urlopen(get_allhit_url())
        main_html = main_response.read()
        main_soup = BeautifulSoup(main_html, parseOnlyThese=SoupStrainer(
            text=re.compile("(^[0-9,]+ HITs|of [0-9]+ Results)")))
        main_stats = [tag for tag in main_soup]
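        # Parse the "<N> HITs" and "of <M> Results" snippets into totals;
        # -1 marks values that could not be parsed.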
        hits_available = -1
        groups_available = -1
        if len(main_stats) > 1:
            hits_available_tmp = main_stats[0]
            hits_available_tmp = hits_available_tmp[:hits_available_tmp.find(' ')].replace(',', '')
            hits_available = int(hits_available_tmp)
            groups_available_tmp = main_stats[1]
            groups_available_tmp = groups_available_tmp[groups_available_tmp.find('of')+3:groups_available_tmp.find('Results')-1]
            groups_available = int(groups_available_tmp)
        main_soup = None

        #Fetching data from every mturk.com HITs list page
        log.debug("Allhit processing")
        result_allhit = self.process_values(
            range(1, self.get_max_page(main_html) + 1),
            callback_allhit, self.processes_count)
        self.data = result_allhit['data']
        self.append_errors(result_allhit['errors'])

        #Fetching html details for every HIT group
        log.debug("Details processing")
        result_details = self.process_values(self.data, callback_details,
                                             self.processes_count)
        self.data = result_details['data']
        self.append_errors(result_details['errors'])

        hits_downloaded = sum([hgs['HitGroupStatus']['hits_available'] for hgs in self.data])
        groups_downloaded = len(self.data)

        #Logging crawl information into the database
        success = False
        if (groups_downloaded > 0 and hits_downloaded > 0 and
                float(groups_available) / groups_downloaded <= 1.5 and
                float(hits_available) / hits_downloaded <= 1.5):
            success = True

        log.debug("Crawl finished with success=%s. Saving main_crawl entry" % success)
        crawl = Crawl(**{
            'start_time':           start_time,
            'end_time':             datetime.datetime.now(),
            'success':              success,
            'hits_available':       hits_available,
            'hits_downloaded':      hits_downloaded,
            'groups_available':     groups_available,
            'groups_downloaded':    groups_downloaded,
            #'errors':               str(self.errors) # !
            'errors':               ''
        })
        crawl.save()

        #Adding crawl FK
        log.debug("Adding FKs")
        result_add_crawlfk = self.process_values(self.data, callback_add_crawlfk,
                                                 crawl=crawl)
        self.data = result_add_crawlfk['data']
        self.append_errors(result_add_crawlfk['errors'])

        #Saving results in the database
        log.debug("Saving results")
        result_save_database = self.process_values(self.data, callback_database)
        self.append_errors(result_save_database['errors'])

        print self.errors

        log.info(
            "Crawler finished %ssuccessfully in %s with %d results, %d HITs (of %d and %d) and %d errors" % (
                "" if success else "un",
                (datetime.datetime.now()-start_time),
                groups_downloaded,
                hits_downloaded,
                groups_available,
                hits_available,
                len(self.errors)
            )
        )

        pid.remove_pid()
    def handle(self, **options):

        pid = Pid(self.pid_file) if self.pid_file else None

        self.start_time = time.time()
        self.process_options(options)

        try:

            self.prepare_data()

            # query crawls in the period we want to process
            crawls = self.get_crawls()
            self.total_count = len(crawls)
            if self.total_count < self.min_crawls:
                self.log.info("Not enough crawls to process.")
                return

            done = 0
            self.log.info("""
            Starting {6}.

            {0} crawls will be processed in chunks of {3} (overlap: {7}).
            -- {1} to
            -- {2},
            id from {4} to {5}.
            """.format(self.total_count,
                self.start.strftime('%y-%m-%d %H:%M:%S'),
                self.end.strftime('%y-%m-%d %H:%M:%S'),
                self.chunk_size,
                crawls[0].id, crawls[self.total_count - 1].id,
                self.display_name, self.overlap))

            # iterate over overlapping chunks of crawls list
            for chunk in self.chunks(crawls, self.chunk_size,
                    overlap=self.overlap):

                start, end = (chunk[-1].start_time, chunk[0].start_time)
                self.log.info(('Chunk of {0} crawls: {1}\nstart_time {2} to '
                    '{3}.').format(len(chunk), [c.id for c in chunk],
                    start.strftime('%y-%m-%d %H:%M:%S'),
                    end.strftime('%y-%m-%d %H:%M:%S')))
                chunk_time = time.time()

                if not self.process_chunk(start, end, chunk):
                    break

                chunk_time = time.time() - chunk_time
                self.store_chunk_time(chunk_time)
                done += len(chunk) - self.overlap
                self.log.info(('\n chunk {0} \n total {1} '
                    '\n ETA   {2}, {3}/{4} done').format(
                    humanized_time(chunk_time),
                    humanized_time(self.get_elapsed()),
                    humanized_time(self.get_eta()),
                    done, self.total_count - self.overlap))

        except Exception as e:
            self.log.exception(e)
        else:
            self.log.info('{0} crawls processed in {1}s, exiting.'.format(
                self.total_count, self.get_elapsed()))
        finally:
            pid and pid.remove_pid()
    def handle(self, **options):

        """
        Take ${lmit} last crawls without spam classification
        Classify all hit groups, update hits_mv to have proper hit classification
        Rebuild crawl_aggregates for a given crawl
        Refresh memcache
        """

        service = get_prediction_service()

        pid = Pid('classify_spam', True)

        transaction.enter_transaction_management()
        transaction.managed(True)

        start_time = time.time()

        try:

            number_of_predictions = 0

            for c in list(Crawl.objects.filter(is_spam_computed=False).order_by('-id')[:options['limit']]):

                log.info("processing %s", c)

                spam = set([])
                not_spam = set([])

                updated = 0

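                # Walk the hits_mv rows of this crawl and classify every
                # group that has no spam flag yet.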
                for row in query_to_dicts("""select content_id, group_id, is_spam from hits_mv
                    where
                        crawl_id = %s""", c.id):

                    log.info("classyfing crawl_id: %s, %s", c.id, row)

                    if row['is_spam'] is None:

                        is_spam = None
                        content = HitGroupContent.objects.get(id=row['content_id'])

                        if content.is_spam is None:
                            data = content.prepare_for_prediction()

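                            # Ask the prediction service to classify the content;
                            # any label other than 'No' is treated as spam.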
                            body = {'input': {'csvInstance': data}}
                            prediction = service.predict(body=body, data=options['file']).execute()

                            number_of_predictions += 1
                            updated += 1

                            content.is_spam = prediction['outputLabel'] != 'No'
                            content.save()

                        execute_sql("update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id']))
                        transaction.commit()

                        if content.is_spam:
                            log.info("detected spam for %s", row)
                            spam.add(str(row['content_id']))
                        else:
                            not_spam.add(str(row['content_id']))

                    else:
                        log.info("is_spam already computed for %s" % row)

                if updated > 0:
                    c.is_spam_computed = True
                    c.save()

                log.info("done classyfing crawl")

                execute_sql("""UPDATE main_crawlagregates
                    set spam_projects =
                        ( select count(*) from hits_mv where crawl_id = %s and is_spam = true )
                    where crawl_id = %s""" % (c.id, c.id))

                transaction.commit()

                log.info("dome processing %s", c)

        except (KeyError, KeyboardInterrupt, HttpError) as e:
            log.error(e)
            transaction.rollback()
            pid.remove_pid()
            exit()