def handle(self, **options):
    pid = Pid('mturk_diffs', True)

    transaction.enter_transaction_management()
    transaction.managed(True)

    start_time = time.time()

    try:
        for c in Crawl.objects.filter(
                is_spam_computed=False).order_by('-id')[:options['limit']]:
            updated = update_cid(c.id)
            if updated > 0:
                c.has_diffs = True
                c.save()
            transaction.commit()
    except (KeyError, KeyboardInterrupt):
        transaction.rollback()
        pid.remove_pid()
        exit()

    logger.info('updating %s crawls took: %s s',
                options['limit'], time.time() - start_time)
    pid.remove_pid()
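# Pid('<name>', True) acts as a simple run lock shared by all of these
# management commands; its implementation is not part of this module. A
# minimal sketch of what such a pidfile helper might look like -- the file
# location and the meaning of the boolean flag are assumptions:
import os
import sys
import tempfile

class Pid(object):
    """Assumed pidfile lock: refuse to start when another instance is running."""

    def __init__(self, name, exit_if_running=True):
        self.path = os.path.join(tempfile.gettempdir(), '%s.pid' % name)
        if os.path.exists(self.path) and exit_if_running:
            sys.exit('%s already running (pidfile %s exists)' % (name, self.path))
        with open(self.path, 'w') as f:
            f.write(str(os.getpid()))

    def remove_pid(self):
        # Drop the pidfile so the next run can proceed.
        if os.path.exists(self.path):
            os.remove(self.path)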
def handle(self, **options):
    pid = Pid('mturk_agregates', True)

    start_time = time.time()
    logging.info('Updating crawl aggregates')
    update_crawl_agregates(1, only_new=True)
    logging.info('db_update_agregates took: %s' % (time.time() - start_time))

    pid.remove_pid()
def handle(self, **options):
    pid = Pid('mturk_crawler', True)

    start_time = time.time()

    logging.info('cleaning up db from duplicates')
    clean_duplicates()

    logging.info('Refreshing hits_mv')
    update_mviews()
    logging.info('done refreshing hits_mv')

    logging.info('db_refresh_mviews took: %s' % (time.time() - start_time))

    pid.remove_pid()
def handle(self, **options):
    pid = Pid('mturk_agregates', True)

    key = 'TOPREQUESTERS_CACHED'
    result = cache.get(key)
    if result is not None:
        logging.info("toprequesters still in cache...")
        # Release the pid lock before bailing out early.
        pid.remove_pid()
        return

    days = options['days']
    logging.info("toprequesters missing, refetching")

    # No cache entry -- perform the query and store the result.
    from mturk.main.views import topreq_data
    start_time = time.time()
    data = topreq_data(days)
    logging.info("toprequesters: filled memcache in %s",
                 time.time() - start_time)
    cache.set(key, data, HOURS4)

    pid.remove_pid()
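# The view side presumably reads the same cache key and falls back to
# recomputing when the warm-up command above has not run yet. A minimal
# sketch under that assumption -- the key name, HOURS4 timeout and
# topreq_data import come from the command above; the function name
# toprequesters_cached and the default `days` value are made up:
from django.core.cache import cache

HOURS4 = 60 * 60 * 4  # assumed to match the timeout used by the command

def toprequesters_cached(days=30):
    """Return toprequesters data, preferring the pre-warmed cache."""
    key = 'TOPREQUESTERS_CACHED'
    data = cache.get(key)
    if data is None:
        # Cache miss: recompute inline and repopulate the cache.
        from mturk.main.views import topreq_data
        data = topreq_data(days)
        cache.set(key, data, HOURS4)
    return data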
def handle(self, **options): """ Take ${lmit} last crawls without spam classification Classify all hit groups, update hits_mv to have proper hit classification Rebuild crawl_aggregates for a given crawl Refresh memcache """ service = get_prediction_service() pid = Pid('classify_spam', True) transaction.enter_transaction_management() transaction.managed(True) start_time = time.time() try: number_of_predictions = 0 for c in list(Crawl.objects.filter(is_spam_computed=False).order_by('-id')[:options['limit']]): log.info("processing %s", c) spam = set([]) not_spam = set([]) updated = 0 for row in query_to_dicts("""select content_id, group_id, is_spam from hits_mv where crawl_id = %s""", c.id): log.info("classyfing crawl_id: %s, %s", c.id,row) if row['is_spam'] is None: is_spam = None content = HitGroupContent.objects.get(id= row['content_id']) if content.is_spam is None: data = content.prepare_for_prediction() body = {'input': {'csvInstance': data}} prediction = service.predict(body=body, data=options['file']).execute() number_of_predictions += 1 updated += 1 content.is_spam = prediction['outputLabel'] != 'No' content.save() execute_sql("update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id'])) transaction.commit() if content.is_spam: log.info("detected spam for %s", row) spam.add(str(row['content_id'])) else: not_spam.add(str(row['content_id'])) else: log.info("is_spam already computed for %s" % row) if updated > 0: c.is_spam_computed=True c.save() log.info("done classyfing crawl") execute_sql("""UPDATE main_crawlagregates set spam_projects = ( select count(*) from hits_mv where crawl_id = %s and is_spam = true ) where crawl_id = %s""" % (c.id, c.id) ) transaction.commit() log.info("dome processing %s", c) except (KeyError, KeyboardInterrupt, HttpError), e: log.error(e) transaction.rollback() pid.remove_pid() exit()
def run(self):
    pid = Pid('mturk_crawler', True)

    logging.info('Crawler started')

    start_time = datetime.datetime.now()

    # Fetch statistical information about the number of groups and HITs.
    logging.debug("Fetching stats")
    main_response = urllib2.urlopen(get_allhit_url())
    main_html = main_response.read()
    main_soup = BeautifulSoup(
        main_html,
        parseOnlyThese=SoupStrainer(
            text=re.compile("(^[0-9,]+ HITs|of [0-9]+ Results)")))
    main_stats = [tag for tag in main_soup]
    hits_available = -1
    groups_available = -1
    if len(main_stats) > 1:
        hits_available_tmp = main_stats[0]
        hits_available_tmp = hits_available_tmp[
            :hits_available_tmp.find(' ')].replace(',', '')
        hits_available = int(hits_available_tmp)
        groups_available_tmp = main_stats[1]
        groups_available_tmp = groups_available_tmp[
            groups_available_tmp.find('of') + 3:
            groups_available_tmp.find('Results') - 1]
        groups_available = int(groups_available_tmp)
    main_soup = None

    # Fetch data from every mturk.com HITs list page.
    logging.debug("Allhit processing")
    result_allhit = self.process_values(
        range(1, self.get_max_page(main_html) + 1),
        callback_allhit, self.processes_count)
    self.data = result_allhit['data']
    self.append_errors(result_allhit['errors'])

    # Fetch HTML details for every HIT group.
    logging.debug("Details processing")
    result_details = self.process_values(self.data, callback_details,
                                         self.processes_count)
    self.data = result_details['data']
    self.append_errors(result_details['errors'])

    hits_downloaded = sum(
        [hgs['HitGroupStatus']['hits_available'] for hgs in self.data])
    groups_downloaded = len(self.data)

    # The crawl counts as successful when something was downloaded and the
    # available/downloaded ratios stay under the threshold (note that these
    # are integer divisions under Python 2).
    success = False
    if (groups_downloaded > 0 and hits_downloaded > 0
            and groups_available / groups_downloaded <= 1.5
            and hits_available / hits_downloaded <= 1.5):
        success = True

    # Log crawl information into the database.
    logging.debug("Crawl finished with success=%s. Saving main_crawl entry"
                  % success)
    crawl = Crawl(**{
        'start_time': start_time,
        'end_time': datetime.datetime.now(),
        'success': success,
        'hits_available': hits_available,
        'hits_downloaded': hits_downloaded,
        'groups_available': groups_available,
        'groups_downloaded': groups_downloaded,
        #'errors': str(self.errors)  # !
        'errors': ''
    })
    crawl.save()

    # Add the crawl FK to every downloaded record.
    logging.debug("Adding FKs")
    result_add_crawlfk = self.process_values(self.data, callback_add_crawlfk,
                                             crawl=crawl)
    self.data = result_add_crawlfk['data']
    self.append_errors(result_add_crawlfk['errors'])

    # Save the results in the database.
    logging.debug("Saving results")
    result_save_database = self.process_values(self.data, callback_database)
    self.append_errors(result_save_database['errors'])

    print self.errors

    logging.info(
        "Crawler finished %ssuccessfully in %s with %d results, "
        "%d HITs (of %d and %d) and %d errors" % (
            "" if success else "un",
            datetime.datetime.now() - start_time,
            groups_downloaded, hits_downloaded,
            groups_available, hits_available,
            len(self.errors)))

    pid.remove_pid()
def handle(self, **options): """ Take ${lmit} last crawls without spam classification Classify all hit groups, update hits_mv to have proper hit classification Rebuild crawl_aggregates for a given crawl Refresh memcache """ service = get_prediction_service() pid = Pid('classify_spam', True) transaction.enter_transaction_management() transaction.managed(True) start_time = time.time() try: number_of_predictions = 0 for c in list( Crawl.objects.filter(is_spam_computed=False).order_by( '-id')[:options['limit']]): log.info("processing %s", c) spam = set([]) not_spam = set([]) updated = 0 for row in query_to_dicts( """select content_id, group_id, is_spam from hits_mv where crawl_id = %s""", c.id): log.info("classyfing crawl_id: %s, %s", c.id, row) if row['is_spam'] is None: is_spam = None content = HitGroupContent.objects.get( id=row['content_id']) if content.is_spam is None: data = content.prepare_for_prediction() body = {'input': {'csvInstance': data}} prediction = service.predict( body=body, data=options['file']).execute() number_of_predictions += 1 updated += 1 content.is_spam = prediction['outputLabel'] != 'No' content.save() execute_sql( "update hits_mv set is_spam = %s where crawl_id = %s and group_id = '%s'" % ('true' if content.is_spam else 'false', c.id, row['group_id'])) transaction.commit() if content.is_spam: log.info("detected spam for %s", row) spam.add(str(row['content_id'])) else: not_spam.add(str(row['content_id'])) else: log.info("is_spam already computed for %s" % row) if updated > 0: c.is_spam_computed = True c.save() log.info("done classyfing crawl") execute_sql("""UPDATE main_crawlagregates set spam_projects = ( select count(*) from hits_mv where crawl_id = %s and is_spam = true ) where crawl_id = %s""" % (c.id, c.id)) transaction.commit() log.info("dome processing %s", c) except (KeyError, KeyboardInterrupt, HttpError), e: log.error(e) transaction.rollback() pid.remove_pid() exit()