示例#1
0
 def get_source_worker(self):
     """
         Looks up the worker who has sent this sample, using the sample's
         own source type and source value.
     """
     find_worker = Sample.get_worker
     return find_worker(source_type=self.source_type,
                        source_val=self.source_val)
示例#2
0
    def setUp(self):
        """
            Prepares the fixture: a user, an active job with one gold sample,
            six training samples with their labels, one classified sample per
            training sample and a classifier created for the job.
        """
        self.u = User.objects.create_user(username='******', password='******')

        self.job = Job.objects.create_active(
            account=self.u.get_profile(),
            gold_samples=[{'url': '10clouds.com', 'label': LABEL_YES}],
        )

        # (sample text, expected label) pairs, kept side by side.
        training_set = (
            ('Mechanical squirrel screwdriver over car', LABEL_YES),
            ('Screwdriver fix mechanical bike bolts', LABEL_YES),
            ('Brown banana apple pinapple potato', LABEL_NO),
            ('apple pinapple potato', LABEL_NO),
            ('Hippo tree over lagoon', LABEL_NO),
            ('Green tan with true fox', LABEL_NO),
        )

        # Samples are only instantiated here; no save() is called on them.
        self.train_data = [
            Sample(job=self.job, source_type='', text=pair[0])
            for pair in training_set
        ]
        self.labels = [pair[1] for pair in training_set]

        # One classified sample per training sample, in the same order.
        self.classified = [
            ClassifiedSample.objects.create(
                job=self.job,
                sample=train_sample,
                label=train_label,
            )
            for train_sample, train_label in zip(self.train_data, self.labels)
        ]

        self.classifier247 = classifier_factory.create_classifier(
            job_id=self.job.id,
        )
示例#3
0
def create_classify_sample(result, source_type, create_classified=True,
        label='', source_val='', *args, **kwargs):
    """
        Creates classified sample from existing sample, therefore we don't need
        web extraction.

        Parameters:
            result - tuple (extraction result, sample id).
            source_type, source_val - stored on the new classified sample and
                used to look up the worker who sent the sample.
            create_classified - when false, no classified sample is created.
            label - when non-empty, the sample is treated as already
                classified and nothing is created.
            *args, **kwargs - accepted but not used.

        Returns False when the extraction result is falsy, and the sample id
        when a non-empty label was passed.
        NOTE(review): on the remaining paths the visible code has no explicit
        return, so the result is None - confirm that callers expect this.

        On DatabaseError the call is retried through
        create_classify_sample.retry with an exponentially growing countdown
        capped at 60 * 60 * 24.
    """

    # We are given a tuple (extraction result, sample id)
    extraction_result = result[0]

    # If extraction failed - return
    if not extraction_result:
        return False
    sample_id = result[1]

    # Don't classify already classified samples
    if label:
        return sample_id

    if create_classified:
        try:
            sample = Sample.objects.get(id=sample_id)

            # A truthy label has already returned above; this normalises any
            # other falsy value (e.g. None) to an empty string.
            if not label:
                label = ''

            # Proper sample entry
            class_sample = ClassifiedSample.objects.create(
                job=sample.job,
                url=sample.url,
                sample=sample,
                label=label,
                source_type=source_type,
                source_val=source_val,
            )

            worker = Sample.get_worker(source_type=source_type,
                    source_val=source_val)
            if worker:
                # Update cache
                worker.get_urls_collected_count_for_job(sample.job, cache=False)

            # Sample created successfully - pushing event.
            send_event(
                "EventNewClassifySample",
                sample_id=class_sample.id,
            )

        except DatabaseError as e:
            # Retry process on db error, such as 'Database is locked'.
            # "as e" replaces the Python-2-only "except X, e" spelling, which
            # is a syntax error on Python 3.
            create_classify_sample.retry(exc=e,
                countdown=min(60 * 2 ** current.request.retries, 60 * 60 * 24))
    def forwards(self, orm):
        """
            Adds a LABEL_YES worker quality vote for every sample that has no
            related gold sample (goldsample__isnull=True) and whose sending
            worker can be found. Samples without a worker are skipped.
        """
        # Note: Remember to use orm['appname.ModelName'] rather than "from appname.models..."
        # NOTE(review): Sample.get_worker below is called on the Sample name
        # in scope, not on orm["main.Sample"] - confirm this is intended.
        for sample in orm["main.Sample"].objects.filter(goldsample__isnull=True):
            worker = Sample.get_worker(source_type=sample.source_type, source_val=sample.source_val)
            if not worker:
                continue

            try:
                orm["crowdsourcing.WorkerQualityVote"].objects.new_vote(worker=worker, sample=sample, label=LABEL_YES)
            except Exception:
                # Such vote already exists - skip.
                # Deliberately best-effort, but no longer a bare "except:",
                # so KeyboardInterrupt and SystemExit still propagate.
                pass
示例#5
0
    def forwards(self, orm):
        """
            Adds a LABEL_YES worker quality vote for every sample that has no
            related gold sample (goldsample__isnull=True) and whose sending
            worker can be found. Samples without a worker are skipped.
        """
        # Note: Remember to use orm['appname.ModelName'] rather than "from appname.models..."
        # NOTE(review): Sample.get_worker below is called on the Sample name
        # in scope, not on orm['main.Sample'] - confirm this is intended.
        for sample in orm['main.Sample'].objects.filter(goldsample__isnull=True):
            worker = Sample.get_worker(
                source_type=sample.source_type,
                source_val=sample.source_val,
            )
            if not worker:
                continue

            try:
                orm['crowdsourcing.WorkerQualityVote'].objects.new_vote(
                    worker=worker,
                    sample=sample,
                    label=LABEL_YES,
                )
            except Exception:
                # Such vote already exists - skip.
                # Deliberately best-effort, but no longer a bare "except:",
                # so KeyboardInterrupt and SystemExit still propagate.
                pass
示例#6
0
    def testVerifyFromTagasauris(self):
        """
            Posts three urls to the Tagasauris sample-add endpoint and checks
            the reported result for: an accepted url, a url over the
            same-domain limit and a url duplicating the gold sample.
        """
        job = Job.objects.create_active(
            account=self.user.get_profile(),
            gold_samples=json.dumps([{'url': 'google.com', 'label': LABEL_YES}]),
            same_domain_allowed=2,
            no_of_urls=10,
        )

        worker_id = '1234'

        # (posted url, expected 'result', expected number of matching samples)
        scenarios = (
            # Verifying first url (and adding)
            ('google.com/1', 'added', 1),
            # This time verification should fail because of too many urls
            # from same domain
            ('google.com/2', 'domain duplicate', 0),
            # This time verification should fail because of duplicated url
            # (look at golden sample)
            ('google.com', 'duplicate', 1),
        )

        for newest_url, outcome, stored in scenarios:
            target = '%ssample/add/tagasauris/%s/?format=json' % (
                self.api_url, job.id)
            payload = json.dumps({
                'url': newest_url,
                'worker_id': worker_id,
            })

            resp = self.c.post(target, payload, "text/json")
            self.assertEqual(resp.status_code, 200)

            resp_dict = json.loads(resp.content)

            for key in ('result', 'all'):
                self.assertTrue(key in resp_dict.keys())

            self.assertEqual(outcome, resp_dict['result'])
            self.assertEqual(False, resp_dict['all'])

            matching = Sample.objects.filter(
                job=job, url=Sample.sanitize_url(newest_url))
            self.assertEqual(matching.count(), stored)
示例#7
0
    def testVerifyFromTagasaurisErrors(self):
        """
            Sends four invalid requests to the Tagasauris sample-add endpoint
            and checks that each gets a non-200 response containing an 'error'
            key and that no sample with the posted url exists afterwards.
        """
        job = Job.objects.create_active(
            account=self.user.get_profile(),
            gold_samples=json.dumps([{'url': 'google.com', 'label': LABEL_YES}]),
            same_domain_allowed=2,
            no_of_urls=10,
        )

        worker_id = '1234'
        newest_url = 'google.com/1'

        # (job id put into the address, arguments passed after the address)
        bad_requests = (
            # Error on not existing job.
            (1234567, (
                json.dumps({'url': newest_url, 'worker_id': worker_id}),
                "text/json",
            )),
            # Error on wrong post data (not json).
            (job.id, (
                {'url': newest_url, 'worker_id': worker_id},
            )),
            # Error on wrong post data (parameters errors).
            (job.id, (
                json.dumps({'worker_id': worker_id}),
                "text/json",
            )),
            (job.id, (
                json.dumps({'url': newest_url}),
                "text/json",
            )),
        )

        for job_id, post_args in bad_requests:
            target = '%ssample/add/tagasauris/%s/?format=json' % (
                self.api_url, job_id)

            resp = self.c.post(target, *post_args)
            self.assertNotEqual(resp.status_code, 200)

            resp_dict = json.loads(resp.content)

            self.assertTrue('error' in resp_dict.keys())

            matching = Sample.objects.filter(
                url=Sample.sanitize_url(newest_url))
            self.assertEqual(matching.count(), 0)
示例#8
0
    def testVerifyFromTagasaurisLimit(self):
        """
            Posts three urls to a job that needs two urls and checks the
            'result' and 'all' values reported by the Tagasauris sample-add
            endpoint before, at and after reaching the limit.
        """
        job = Job.objects.create_active(
            account=self.user.get_profile(),
            gold_samples=json.dumps([{'url': 'google.com', 'label': LABEL_YES}]),
            same_domain_allowed=20,
            no_of_urls=2,
        )

        worker_id = '1234'

        # (posted url, expected 'result', expected 'all', expected number of
        #  matching samples, whether to compare the collected urls count with
        #  the job's no_of_urls afterwards)
        steps = (
            # Verifying first url (and adding). We need one more.
            ('google.com/1', 'added', False, 1, False),
            # Verifying second url. Gathering should be completed.
            ('google.com/2', 'added', True, 1, True),
            # Verifying third url. Gathering should be completed but url
            # won't be added.
            ('google.com/3', '', True, 0, True),
        )

        for newest_url, outcome, gathered, stored, limit_reached in steps:
            target = '%ssample/add/tagasauris/%s/?format=json' % (
                self.api_url, job.id)
            payload = json.dumps({
                'url': newest_url,
                'worker_id': worker_id,
            })

            resp = self.c.post(target, payload, "text/json")
            self.assertEqual(resp.status_code, 200)

            resp_dict = json.loads(resp.content)

            for key in ('result', 'all'):
                self.assertTrue(key in resp_dict.keys())

            self.assertEqual(outcome, resp_dict['result'])
            self.assertEqual(gathered, resp_dict['all'])

            matching = Sample.objects.filter(
                job=job, url=Sample.sanitize_url(newest_url))
            self.assertEqual(matching.count(), stored)

            if limit_reached:
                self.assertEqual(job.get_urls_collected(), job.no_of_urls)
示例#9
0
    def testVerifyFromTagasauris(self):
        """
            Posts three urls to the Tagasauris sample-add endpoint and checks
            the reported result for: an accepted url, a url over the
            same-domain limit and a url duplicating the gold sample.
        """
        job = Job.objects.create_active(
            account=self.user.get_profile(),
            gold_samples=json.dumps([{'url': 'google.com', 'label': LABEL_YES}]),
            same_domain_allowed=2,
            no_of_urls=10,
        )

        worker_id = '1234'

        def submit(url):
            # Posts the url for the worker, checks the status code and the
            # presence of both expected keys, returns the decoded response.
            resp = self.c.post(
                '%ssample/add/tagasauris/%s/?format=json'
                % (self.api_url, job.id),
                json.dumps({'url': url, 'worker_id': worker_id}),
                "text/json")
            self.assertEqual(resp.status_code, 200)

            decoded = json.loads(resp.content)
            self.assertTrue('result' in decoded.keys())
            self.assertTrue('all' in decoded.keys())
            return decoded

        def stored(url):
            # Number of this job's samples whose url equals the sanitized url.
            return Sample.objects.filter(
                job=job, url=Sample.sanitize_url(url)).count()

        # Verifying first url (and adding)
        answer = submit('google.com/1')
        self.assertEqual('added', answer['result'])
        self.assertEqual(False, answer['all'])
        self.assertEqual(stored('google.com/1'), 1)

        # This time verification should fail because of too many urls from
        # same domain
        answer = submit('google.com/2')
        self.assertEqual('domain duplicate', answer['result'])
        self.assertEqual(False, answer['all'])
        self.assertEqual(stored('google.com/2'), 0)

        # This time verification should fail because of duplicated url (look
        # at golden sample)
        answer = submit('google.com')
        self.assertEqual('duplicate', answer['result'])
        self.assertEqual(False, answer['all'])
        self.assertEqual(stored('google.com'), 1)
示例#10
0
    def testVerifyFromTagasaurisErrors(self):
        """
            Sends four invalid requests to the Tagasauris sample-add endpoint
            and checks that each gets a non-200 response containing an 'error'
            key and that no sample with the posted url exists afterwards.
        """
        job = Job.objects.create_active(
            account=self.user.get_profile(),
            gold_samples=json.dumps([{'url': 'google.com', 'label': LABEL_YES}]),
            same_domain_allowed=2,
            no_of_urls=10,
        )

        worker_id = '1234'
        newest_url = 'google.com/1'

        def expect_rejected(job_id, *post_args):
            # Posts to the endpoint of the given job id, then checks the
            # non-200 status, the 'error' key and that no sample was added.
            resp = self.c.post(
                '%ssample/add/tagasauris/%s/?format=json'
                % (self.api_url, job_id),
                *post_args)
            self.assertNotEqual(resp.status_code, 200)

            decoded = json.loads(resp.content)
            self.assertTrue('error' in decoded.keys())

            leftovers = Sample.objects.filter(
                url=Sample.sanitize_url(newest_url))
            self.assertEqual(leftovers.count(), 0)

        # Error on not existing job.
        expect_rejected(
            1234567,
            json.dumps({'url': newest_url, 'worker_id': worker_id}),
            "text/json")

        # Error on wrong post data (not json).
        expect_rejected(job.id, {'url': newest_url, 'worker_id': worker_id})

        # Error on wrong post data (parameters errors).
        expect_rejected(
            job.id, json.dumps({'worker_id': worker_id}), "text/json")

        expect_rejected(
            job.id, json.dumps({'url': newest_url}), "text/json")
示例#11
0
    def testVerifyFromTagasaurisLimit(self):
        """
            Posts three urls to a job that needs two urls and checks the
            'result' and 'all' values reported by the Tagasauris sample-add
            endpoint before, at and after reaching the limit.
        """
        job = Job.objects.create_active(
            account=self.user.get_profile(),
            gold_samples=json.dumps([{'url': 'google.com', 'label': LABEL_YES}]),
            same_domain_allowed=20,
            no_of_urls=2,
        )

        worker_id = '1234'

        def submit(url):
            # Posts the url for the worker, checks the status code and the
            # presence of both expected keys, returns the decoded response.
            resp = self.c.post(
                '%ssample/add/tagasauris/%s/?format=json'
                % (self.api_url, job.id),
                json.dumps({'url': url, 'worker_id': worker_id}),
                "text/json")
            self.assertEqual(resp.status_code, 200)

            decoded = json.loads(resp.content)
            self.assertTrue('result' in decoded.keys())
            self.assertTrue('all' in decoded.keys())
            return decoded

        def stored(url):
            # Number of this job's samples whose url equals the sanitized url.
            return Sample.objects.filter(
                job=job, url=Sample.sanitize_url(url)).count()

        # Verifying first url (and adding). We need one more.
        answer = submit('google.com/1')
        self.assertEqual('added', answer['result'])
        self.assertEqual(False, answer['all'])
        self.assertEqual(stored('google.com/1'), 1)

        # Verifying second url. Gathering should be completed.
        answer = submit('google.com/2')
        self.assertEqual('added', answer['result'])
        self.assertEqual(True, answer['all'])
        self.assertEqual(stored('google.com/2'), 1)
        self.assertEqual(job.get_urls_collected(), job.no_of_urls)

        # Verifying third url. Gathering should be completed but url won't be
        # added.
        answer = submit('google.com/3')
        self.assertEqual('', answer['result'])
        self.assertEqual(True, answer['all'])
        self.assertEqual(stored('google.com/3'), 0)
        self.assertEqual(job.get_urls_collected(), job.no_of_urls)
示例#12
0
 def get_source_worker(self):
     """
         Looks up the worker who has sent this sample, using the sample's
         own source type and source value.
     """
     sender = Sample.get_worker(
         source_type=self.source_type,
         source_val=self.source_val,
     )
     return sender