def __init__(self, name, url, test_username_pos, status_code=None,
             match_type=None, match_expr=None, test_username_neg=None,
             headers=None, censor_images=False, wait_time=1,
             use_proxy=False):
    ''' Constructor. '''

    self.name = name
    self.url = url
    self.status_code = status_code
    self.match_type = match_type or 'text'
    self.match_expr = match_expr
    self.test_username_pos = test_username_pos
    # Use None as the default to avoid a shared mutable default argument.
    self.headers = headers if headers is not None else {}
    self.censor_images = censor_images
    self.use_proxy = use_proxy
    self.wait_time = wait_time

    if test_username_neg is None:
        self.test_username_neg = random_string(16)
    else:
        self.test_username_neg = test_username_neg
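# A minimal usage sketch of the constructor above, assuming it belongs to
# the Site model. The class name, site name, and URL template below are
# hypothetical; only `test_username_pos` is required besides name and url.
site = Site(
    name='Example Forum',                       # hypothetical site
    url='https://example.com/users/%s',         # hypothetical URL template
    test_username_pos='known_user',             # username that should exist
    status_code=200,                            # expect HTTP 200 on a hit
    match_type='text',
    match_expr='Profile',
)
# test_username_neg defaults to a random 16-character string, so a
# nonexistent account is probed without hard-coding one.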
def delete_expired_results():
    """
    Delete results older than `_days_to_keep_result` days.

    Sites that had expired results are retested.
    """
    worker.start_job()
    db_session = worker.get_session()
    tested_sites = set()
    expiry = datetime.utcnow() - timedelta(days=_days_to_keep_result)
    expired_results = db_session.query(Result) \
        .filter(Result.created_at < expiry) \
        .all()

    for result in expired_results:
        # Detach permanent image files so they are not deleted along with
        # the result. (Guard against results that have no image file.)
        if result.image_file is not None and \
                result.image_file.name in _permanent_images:
            result.image_file = None
            result.image_file_id = None
            db_session.flush()

        db_session.delete(result)

        # Retest each affected site only once.
        if result.site_id not in tested_sites:
            tracker_id = 'tracker.{}'.format(random_string(10))
            test_site.enqueue(result.site_id, tracker_id)
            tested_sites.add(result.site_id)

    db_session.commit()
    worker.finish_job()
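# A self-contained illustration of the retention cutoff used above,
# assuming a hypothetical retention window of 30 days: anything created
# before `expiry` is purged and its site is queued for retesting.
from datetime import datetime, timedelta

_days_to_keep_result = 30  # hypothetical value; configured elsewhere
expiry = datetime.utcnow() - timedelta(days=_days_to_keep_result)
print(expiry)  # 30 days before the current UTC time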
def __init__(self, name, mime, user_id, content=None, zip_archive=False,
             zip_files=None, zip_str_files=None, access_type='private'):
    ''' Constructor. '''

    # Use None defaults to avoid shared mutable default arguments.
    zip_files = zip_files if zip_files is not None else []
    zip_str_files = zip_str_files if zip_str_files is not None else []

    self.name = name
    self.mime = mime
    self.user_id = user_id
    self.access_type = access_type

    # Create dummy content to use in the hash if there is no content
    # (e.g. zip archives, which are written separately below).
    if content is None:
        content = 'DUMMY DATA - {}'.format(random_string(1000)) \
            .encode('utf8')

    hash_ = hashlib.sha256()
    hash_.update(content)
    self.hash = hash_.digest()

    # Write content to a path derived from the content hash:
    # data/<hex[0]>/<hex[1]>/<hex[2:]>.
    data_dir = get_path('data')
    hash_hex = binascii.hexlify(self.hash).decode('ascii')
    dir1 = os.path.join(data_dir, hash_hex[0])
    dir2 = os.path.join(dir1, hash_hex[1])
    path = os.path.join(dir2, hash_hex[2:])

    if not os.path.isdir(dir1):
        os.mkdir(dir1)

    if not os.path.isdir(dir2):
        os.mkdir(dir2)

    if not os.path.isfile(path):
        if zip_archive:
            self.zip_files(path, zip_files, zip_str_files)
        else:
            with open(path, 'wb') as file_:
                file_.write(content)
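# A minimal sketch of the content-addressed layout used above: the SHA-256
# hex digest is split into two one-character shard directories plus the
# remainder as the filename. Self-contained; no project imports needed.
import hashlib
import os

content = b'hello world'
hash_hex = hashlib.sha256(content).hexdigest()
path = os.path.join('data', hash_hex[0], hash_hex[1], hash_hex[2:])
print(path)
# data/b/9/4d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9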
def __init__(self, name, url, test_username_pos, status_code=None,
             match_type=None, match_expr=None, test_username_neg=None,
             headers=None):
    ''' Constructor. '''

    self.name = name
    self.url = url
    self.status_code = status_code
    self.match_type = match_type or 'text'
    self.match_expr = match_expr
    self.test_username_pos = test_username_pos
    # Use None as the default to avoid a shared mutable default argument.
    self.headers = headers if headers is not None else {}

    if test_username_neg is None:
        self.test_username_neg = random_string(16)
    else:
        self.test_username_neg = test_username_neg
def post(self):
    '''
    Request a search of usernames.

    **Example Request**

    .. sourcecode:: json

        {
            "usernames": [
                "johndoe",
                "janedoe",
                ...
            ],
            "category": 3,
            "test": false
        }

    **Example Response**

    .. sourcecode:: json

        {
            "tracker_ids": {
                "johndoe": "tracker.12344565"
            }
        }

    :<header Content-Type: application/json
    :<header X-Auth: the client's auth token
    :<json list usernames: a list of usernames to search for
    :<json int category: ID of site category to use (optional)
    :<json int site: ID of site to search (optional)
    :<json bool test: test results (optional, default: false)

    :>header Content-Type: application/json
    :>json object tracker_ids: worker tracking IDs, keyed by username

    :status 202: accepted for background processing
    :status 400: invalid request body
    :status 401: authentication required
    '''
    test = False
    category = None
    category_id = None
    jobs = []
    tracker_ids = dict()
    redis = g.redis
    request_json = request.get_json()
    site = None

    if 'usernames' not in request_json:
        raise BadRequest('`usernames` is required')

    validate_request_json(request_json, _username_attrs)

    if len(request_json['usernames']) == 0:
        raise BadRequest('At least one username is required')

    if 'category' in request_json and 'site' in request_json:
        raise BadRequest('Supply either `category` or `site`.')

    if 'category' in request_json:
        category_id = request_json['category']
        category = g.db.query(Category) \
            .filter(Category.id == category_id).first()

        if category is None:
            raise NotFound("Category '%s' does not exist." % category_id)
        else:
            category_id = category.id

    if 'site' in request_json:
        site_id = request_json['site']
        site = g.db.query(Site).filter(Site.id == site_id).first()

        if site is None:
            raise NotFound("Site '%s' does not exist." % site_id)

    if 'test' in request_json:
        test = request_json['test']

    if category:
        sites = category.sites
    elif site:
        sites = g.db.query(Site).filter(Site.id == site.id).all()
    else:
        sites = g.db.query(Site).all()

    # Only check valid sites.
    valid_sites = [site for site in sites if site.valid]

    if len(valid_sites) == 0:
        raise NotFound('No valid sites to check')

    for username in request_json['usernames']:
        # Create an object in redis to track the number of sites
        # completed in this search.
        tracker_id = 'tracker.{}'.format(random_string(10))
        tracker_ids[username] = tracker_id
        redis.set(tracker_id, 0)
        redis.expire(tracker_id, 600)
        total = len(valid_sites)

        # Queue a job for each site.
        for site in valid_sites:
            description = 'Checking {} for user "{}"' \
                .format(site.name, username)
            job = worker.scrape.check_username.enqueue(
                username=username,
                site_id=site.id,
                category_id=category_id,
                total=total,
                tracker_id=tracker_id,
                test=test,
                jobdesc=description,
                timeout=_redis_worker['username_timeout'],
                user_id=g.user.id
            )

            jobs.append({
                'id': job.id,
                'username': username,
                'category': category_id,
            })

    response = jsonify(tracker_ids=tracker_ids)
    response.status_code = 202

    return response
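# A hedged client-side sketch of the endpoint above, assuming the handler
# is mounted at /api/username/ and that X-Auth carries the client's auth
# token. The host, path, and token are hypothetical.
import requests

response = requests.post(
    'https://app.example.com/api/username/',
    headers={'X-Auth': 'my-auth-token'},
    json={'usernames': ['johndoe', 'janedoe'], 'category': 3},
)
assert response.status_code == 202  # accepted for background processing
print(response.json()['tracker_ids'])
# e.g. {'johndoe': 'tracker.abc123defg', 'janedoe': 'tracker.hij456klmn'}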
def post_jobs_for_site(self, site_id):
    """
    Request background jobs for the site identified by `site_id`.

    **Example Request**

    .. sourcecode:: json

        {
            "jobs": [
                {
                    "name": "test"
                },
                ...
            ]
        }

    **Example Response**

    .. sourcecode:: json

        {
            "tracker_ids": {
                "1": "tracker.12344565"
            }
        }

    :<header Content-Type: application/json
    :<header X-Auth: the client's auth token
    :<json list jobs: a list of jobs to schedule
    :<json string jobs[n].name: name of job

    :>header Content-Type: application/json
    :>json object tracker_ids: worker tracking IDs, keyed by site ID

    :status 202: scheduled
    :status 400: invalid request body
    :status 401: authentication required
    """
    request_attrs = {
        'jobs': {'type': list, 'required': True},
    }
    job_attrs = {
        'name': {'type': str, 'required': True},
    }
    available_jobs = ['test']
    tracker_ids = dict()

    # Get site.
    site_id = get_int_arg('site_id', site_id)
    site = g.db.query(Site).filter(Site.id == site_id).first()

    # Validate.
    if site is None:
        raise NotFound("Site '%s' does not exist." % site_id)

    request_json = request.get_json()
    validate_request_json(request_json, request_attrs)

    for job in request_json['jobs']:
        validate_json_attr('name', job_attrs, job)

        if job['name'] not in available_jobs:
            raise BadRequest('`{}` does not exist in available jobs: {}'
                             .format(job['name'], ','.join(available_jobs)))

    # Schedule jobs.
    for job in request_json['jobs']:
        tracker_id = 'tracker.{}'.format(random_string(10))
        tracker_ids[site.id] = tracker_id

        if job['name'] == 'test':
            description = 'Testing site "{}"'.format(site.name)
            worker.scrape.test_site.enqueue(
                site_id=site.id,
                tracker_id=tracker_id,
                jobdesc=description,
                user_id=g.user.id,
            )

    response = jsonify(tracker_ids=tracker_ids)
    response.status_code = 202

    return response
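# A hedged client-side sketch of the single-site endpoint above, assuming
# it is routed at /api/site/<site_id>/jobs. The host, path, and token are
# hypothetical; 'test' is the only job name the handler accepts.
import requests

response = requests.post(
    'https://app.example.com/api/site/1/jobs',
    headers={'X-Auth': 'my-auth-token'},
    json={'jobs': [{'name': 'test'}]},
)
assert response.status_code == 202
print(response.json()['tracker_ids'])  # e.g. {'1': 'tracker.abc123defg'}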
def post(self):
    '''
    Request a search of usernames.

    **Example Request**

    .. sourcecode:: json

        {
            "usernames": [
                "johndoe",
                "janedoe",
                ...
            ],
            "group": 3,
            "test": false
        }

    **Example Response**

    .. sourcecode:: json

        {
            "tracker_ids": {
                "johndoe": "tracker.12344565"
            }
        }

    :<header Content-Type: application/json
    :<header X-Auth: the client's auth token
    :<json list usernames: a list of usernames to search for
    :<json int group: ID of site group to use (optional)
    :<json int site: ID of site to search (optional)
    :<json bool test: test results (optional, default: false)

    :>header Content-Type: application/json
    :>json object tracker_ids: worker tracking IDs, keyed by username

    :status 202: accepted for background processing
    :status 400: invalid request body
    :status 401: authentication required
    '''
    test = False
    group = None
    group_id = None
    jobs = []
    tracker_ids = dict()
    redis = g.redis
    request_json = request.get_json()
    site = None

    if 'usernames' not in request_json:
        raise BadRequest('`usernames` is required')

    validate_request_json(request_json, USERNAME_ATTRS)

    if len(request_json['usernames']) == 0:
        raise BadRequest('At least one username is required')

    if 'group' in request_json and 'site' in request_json:
        raise BadRequest('Supply either `group` or `site`.')

    if 'group' in request_json:
        group_id = request_json['group']
        group = g.db.query(Group).filter(Group.id == group_id).first()

        if group is None:
            raise NotFound("Group '%s' does not exist." % group_id)
        else:
            group_id = group.id

    if 'site' in request_json:
        site_id = request_json['site']
        site = g.db.query(Site).filter(Site.id == site_id).first()

        if site is None:
            raise NotFound("Site '%s' does not exist." % site_id)

    if 'test' in request_json:
        test = request_json['test']

    if group:
        sites = group.sites
    elif site:
        sites = g.db.query(Site).filter(Site.id == site.id)
    else:
        sites = g.db.query(Site)

    # Only check valid sites.
    sites = sites.filter(Site.valid == True).all()  # noqa

    if len(sites) == 0:
        raise NotFound('No valid sites to check')

    for username in request_json['usernames']:
        # Create an object in redis to track the number of sites
        # completed in this search.
        tracker_id = 'tracker.{}'.format(random_string(10))
        tracker_ids[username] = tracker_id
        redis.set(tracker_id, 0)
        redis.expire(tracker_id, 600)
        total = len(sites)

        # Queue a job for each site.
        for site in sites:
            job_id = app.queue.schedule_username(
                username=username,
                site=site,
                group_id=group_id,
                total=total,
                tracker_id=tracker_id,
                test=test
            )

            jobs.append({
                'id': job_id,
                'username': username,
                'group': group_id,
            })

    response = jsonify(tracker_ids=tracker_ids)
    response.status_code = 202

    return response
def post_jobs_for_sites(self):
    """
    Request background jobs for all sites.

    **Example Request**

    .. sourcecode:: json

        {
            "jobs": [
                {
                    "name": "test"
                },
                ...
            ]
        }

    **Example Response**

    .. sourcecode:: json

        {
            "tracker_ids": {
                "1": "tracker.12344565"
            }
        }

    :<header Content-Type: application/json
    :<header X-Auth: the client's auth token
    :<json list jobs: a list of jobs to schedule
    :<json string jobs[n].name: name of job

    :>header Content-Type: application/json
    :>json object tracker_ids: worker tracking IDs, keyed by site ID

    :status 202: scheduled
    :status 400: invalid request body
    :status 401: authentication required
    """
    request_attrs = {
        'jobs': {'type': list, 'required': True},
    }
    job_attrs = {
        'name': {'type': str, 'required': True},
    }
    available_jobs = ['test']
    tracker_ids = dict()

    request_json = request.get_json()
    validate_request_json(request_json, request_attrs)

    for job in request_json['jobs']:
        validate_json_attr('name', job_attrs, job)

        if job['name'] not in available_jobs:
            raise BadRequest('`{}` does not exist in available jobs: {}'
                             .format(job['name'], ','.join(available_jobs)))

    # Get sites.
    sites = g.db.query(Site).all()

    # Schedule jobs.
    for job in request_json['jobs']:
        for site in sites:
            tracker_id = 'tracker.{}'.format(random_string(10))
            tracker_ids[site.id] = tracker_id

            if job['name'] == 'test':
                app.queue.schedule_site_test(
                    site=site,
                    tracker_id=tracker_id,
                )

    response = jsonify(tracker_ids=tracker_ids)
    response.status_code = 202

    return response
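# A hedged client-side sketch of the all-sites endpoint above, assuming it
# is routed at /api/site/jobs. The host, path, and token are hypothetical.
import requests

response = requests.post(
    'https://app.example.com/api/site/jobs',
    headers={'X-Auth': 'my-auth-token'},
    json={'jobs': [{'name': 'test'}]},
)
assert response.status_code == 202
# One tracker ID per site, keyed by site ID:
print(response.json()['tracker_ids'])
# e.g. {'1': 'tracker.abc123defg', '2': 'tracker.hij456klmn', ...}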